#!/usr/bin/Rscript

# Copyright (c) 2014,
# Mathias Kuhring, KuhringM@rki.de, Robert Koch Institute, Germany, 
# All rights reserved. For details, please note the license.txt.

# R script to filter a BLAT psl file for maximum matching hits
# Usage: Rscript <this script> <input.psl> <output.psl>
#   args[1]  path of the BLAT psl file to filter (with its 5-line header)
#   args[2]  path of the filtered psl file to write
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("usage: Rscript <script> <input.psl> <output.psl>", call. = FALSE)
}
infile <- args[1]
outfile <- args[2]

# number of header lines this script copies verbatim and skips before the data
header_lines <- 5L

# prepare psl column names
pslnames <- paste(
  "match mismatch repmatch Ns Qgapcount Qgapbases Tgapcount Tgapbases",
  "strand Qname Qsize Qstart Qend Tname Tsize Tstart Tend",
  "blockcount blockSizes qStarts tStarts"
)
pslnames <- strsplit(pslnames, split = "[ \t]+")[[1]]

# import the psl file
# Read one line more than the header so a header-only file (no hits) can be
# detected up front; read.table() would otherwise abort on it.
probe <- readLines(infile, n = header_lines + 1L)
pslheader <- probe[seq_len(min(length(probe), header_lines))]

if (length(probe) > header_lines) {
  # Every column is read as plain text so that fields are written back exactly
  # as they were read: no numeric/logical conversion of names (e.g. "007" or
  # "T"), and no special treatment of quote or '#' characters inside names.
  psldata <- read.table(infile, sep = "\t", skip = header_lines,
                        colClasses = "character", quote = "",
                        comment.char = "", stringsAsFactors = FALSE)
} else {
  # no alignment lines: continue with an empty table, output is header only
  psldata <- as.data.frame(
    matrix(character(0), nrow = 0, ncol = length(pslnames)),
    stringsAsFactors = FALSE
  )
}
colnames(psldata) <- pslnames

# the match count is the only column needed as a number (for sorting)
match_count <- as.numeric(psldata[["match"]])
if (any(is.na(match_count))) {
  stop("non-numeric value in the 'match' column of ", infile, call. = FALSE)
}

# sort psl data by query name and descending match count, then keep the first
# (i.e. maximum match) entry per query; order() leaves ties in input order
sorted <- psldata[order(psldata[["Qname"]], -match_count), , drop = FALSE]
uniques <- sorted[!duplicated(sorted[["Qname"]]), , drop = FALSE]

# export the filtered psl data: original header first, then the kept hits
writeLines(pslheader, outfile)
if (nrow(uniques) > 0) {
  write.table(uniques, outfile, append = TRUE, quote = FALSE, sep = "\t",
              row.names = FALSE, col.names = FALSE)
}
