Package core :: Module AnalyseMapping
[hide private]
[frames] | no frames]

Source Code for Module core.AnalyseMapping

   1  #!/usr/bin/python3 
   2  """ This script is used to evalulate the mapping results """ 
   3   
   4  import HTSeq 
   5  import cPickle as pickle 
   6  import numpy as np 
   7  import time 
   8  import sys 
   9  import re 
  10  import subprocess 
  11  import random  
  12  import os 
  13   
  14   
  15   
  16  alngts = 0 
  17  AlignedReadsdic = {} 
  18  algnedToRef = 0 
  19  algnedToArt = 0 
  20   
  21  """ Helper debug functions.... """ 
22 -def savepickle(dictionaary, outputname):
23 pickle.dump(dictionaary, open(outputname + ".p", "wb")) 24 print("Saved .pickle!")
25
26 -def loadpickle(inputname):
27 dictionary = pickle.load(open(inputname + ".p")) 28 print("Loaded " + inputname + ".pickle!") 29 return (dictionary)
30 31
32 -class CustomRead:
33 """ 34 Class for exhaustive reading of the sam alignments. 35 36 """ 37
38 - def __init__(self, readname, mr, nm, subs, score, mq, start, end, gaps, mism):
39 self.id = readname # read identifier 40 self.mr = [mr] # matched reference 41 self.subs =[subs] # substitutions in artificial reference genome 42 self.nm = [nm] # number of mismatches (according to SAM) 43 self.rq = score # read quality score 44 self.mq = [mq] # mapping quality 45 self.start = [start] # alignment start 46 self.end = [end] # alignment end 47 #mr,nm,subs,score,mq,start,end 48 self.gaps = [gaps] 49 self.mism = [mism]
50
51 - def toObjself(self, read):
52 self.mr.append(read.mr[0]) 53 self.subs.append(read.subs[0]) 54 self.nm.append(read.nm[0]) 55 self.mq.append(read.mq[0]) 56 self.start.append(read.start[0]) 57 self.end.append(read.end[0]) 58 self.gaps.append(read.gaps[0]) 59 self.mism.append(read.mism[0])
60
61 - def toStr(self, readname):
62 """Converts the object to a string """ 63 str = "" 64 65 for i, read in enumerate(self.mr): 66 #print read 67 str += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, self.mr[i], self.subs[i], self.nm[i], self.rq, self.mq[i], self.start[i], self.end[i], self.gaps[i], self.mism[i]) 68 return(str) 69
70 - def toStrNoMem(self, identifier):
71 """Converts the object to a string """ 72 73 if identifier == "art": 74 str = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (self.id, 0, self.subs[0], self.nm[0], self.rq, self.mq[0], self.start[0], self.end[0], self.gaps[0], self.mism[0]) 75 else: 76 str = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (self.id, 1, 0, self.nm[0], self.rq, self.mq[0], self.start[0], self.end[0], self.gaps[0], self.mism[0]) 77 return(str) 78 79 80
81 -class TPRead:
82 """ Class for exhaustive reading of the sam alignments. Only for TP """
83 - def __init__(self, nm, score, mq, start, end, gaps, mism):
84 self.nm = [nm] # number of mismatches (according to SAM) 85 self.rq = score # read quality score 86 self.mq = [mq] # mapping quality 87 self.start = [start] # alignment start 88 self.end = [end] # alignment end 89 self.gaps = [gaps] 90 self.mism = [mism]
91
92 - def toObjself(self, read):
93 self.nm.append( read.nm[0]) 94 self.mq.append(read.mq[0]) 95 self.start.append( read.start[0]) 96 self.end.append( read.end[0]) 97 self.gaps.append( read.gaps[0]) 98 self.mism.append(read.mism[0])
99
100 - def toStr(self, readname):
101 str = "" 102 for i, read in enumerate(self.mr): 103 str += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, self.nm[i], self.rq, self.mq[i], self.start[i], self.end[i], self.gaps[i], self.mism[i]) 104 return(str) 105 106 """ returns 1 if an equal alignment is contained in the array """
107 - def isContained(self, read):
108 for i, position in enumerate(self.start): 109 # Checks if a given Read (more accurate the alignment) is already in the object. this is used to check against FP 110 if position == read.start and self.end[i] == read.end: 111 return(1) 112 return(0)
113 114
115 -class ReadID:
116 """ Class for efficient addressing for np. arrays! """
117 - def __init__(self, internalID, quality):
118 self.internalID = internalID 119 self.quality = quality
120 121 122 ############################################# CLASSES END########################################### 123 124
125 -def getTPRead(alngt, compareList, readdic):
126 """ 127 Funtion which transforms an HTSeq alignment to a TPRead class. 128 129 @type alngt: alignment 130 @param alngt: alignment from the sam file. 131 @type compareList: list 132 @param compareList: list which indicates differences between reference / artificial 133 @type readdic: dictionary 134 @param readdic: Contains ranks and qualities for every entry in the fastq file 135 136 @rtype: TPRead 137 @return: Transformed ReadObject. 138 """ 139 cigar = alngt.cigar 140 gaps = sum([i.size for i in cigar if i.type=="I" or i.type=="D"]) 141 mism = sum([i.size for i in cigar if i.type=="M"]) - sum([int(s) for s in alngt.optional_field("MD") if s.isdigit()]) 142 nm = alngt.optional_field("NM") 143 144 #####->TPRead(nm, score, mq, start, end, gaps, mism, n) 145 return (TPRead(nm, readdic[alngt.read.name], alngt.aQual, alngt.iv.start, alngt.iv.end, gaps, mism))
146
147 -def getCustomRead(alngt, compareList, readdic):
148 """ 149 Funtion which transforms an HTSeq alignment to a CustomRead class. 150 151 @type alngt: alignment 152 @param alngt: alignment from the sam file. 153 @type compareList: list 154 @param compareList: list which indicates differences between reference / artificial 155 @type readdic: dictionary 156 @param readdic: Contains ranks and qualities for every entry in the fastq file 157 158 @rtype: CustomRead 159 @return: Transformed ReadObject. 160 161 """ 162 cigar = alngt.cigar 163 # number of gaps is the sum of "I" (insertions) and "D" deletions. 164 gaps = sum([i.size for i in cigar if i.type=="I" or i.type=="D"]) 165 166 # mismatches are calculated as cigar XM - MDtag occurrences of numbers 167 # which indicate mismatches 168 mism = sum([i.size for i in cigar if i.type=="M"]) - getsum(re.findall("(\d+)", alngt.optional_field("MD"))) 169 170 171 nm = alngt.optional_field("NM") 172 173 ######## 174 if len(alngt.read.qualstr) <= 1: 175 176 return (CustomRead(alngt.read.name,0,nm,getHammingdistance(compareList, alngt.iv.start, alngt.iv.end),readdic[alngt.read.name],alngt.aQual,alngt.iv.start,alngt.iv.end,gaps,mism)) 177 else: 178 return (CustomRead(alngt.read.name,0,nm,getHammingdistance(compareList, alngt.iv.start, alngt.iv.end),(alngt.read.qual.mean() / (alngt.read.qual.max() - alngt.read.qual.min() + 1)),alngt.aQual,alngt.iv.start,alngt.iv.end,gaps,mism))
179
180 -def GetOrderDictionary(referenceSAM):
181 """ 182 Function to get a dictionary containing the rank of a read (given a sorted samfile by readname, samtools). 183 184 @type referenceSAM: string 185 @param referenceSAM: Inputfile name for reference SAM file. 186 @rtype: dictionary 187 @return: Internalnaming according to the sorting of samtools. Key = ReadID, Value = rank 188 """ 189 internalDic = {} 190 191 # use ultrafast awk for getting only readnames from aligned fasta file 192 readnames = str(subprocess.check_output("""awk < %s -F "\t" '{print $1 }'""" % referenceSAM, shell=True)) 193 194 # make to list 195 readnames = readnames.split("\n") 196 197 i = 0 198 # skip header files 199 while readnames[i][0] == ("@"): 200 i += 1 201 202 index = 0 203 for k in xrange(i, len(readnames)): 204 if readnames[k] in internalDic: 205 pass 206 else: 207 internalDic[readnames[k]] = index 208 #enumeration of the index 209 index += 1 210 211 212 213 return(internalDic)
214 215
216 -def getNextLine(textfile, compareList, readdic):
217 """ 218 Function which returns the next line from a SAMFile. 219 220 @type textfile: fileobject stream 221 @param textfile: SAMfile 222 @type compareList: list 223 @param compareList: AA sequence of the reference genome. 224 @type readdic: Dictionary 225 @param readdic: Boolean which decides if unbalanced mutations are allowed (only initial mutation is performed) 226 @rtype: Readobj 227 @return: Parsed read from text line. 228 """ 229 230 # dummy is needed because nomem and standart reading return "different" values. 231 # dummy, equals readname 232 line = textfile.readline() 233 if not line: 234 return(0) 235 CRead, dummy = readSAMline(line, "noMem", compareList, readdic) 236 return(CRead)
237 238 239
240 -def getMisGap(mdtag, cigar):
241 """ 242 Reads the alignment tag given, the text and a tag to search for 243 244 @type mdtag: string 245 @param mdtag: MDTag from alignment 246 @type cigar: string 247 @param cigar: Cigar from alignment 248 @rtype: gaps,mismatches 249 @return: Parsed gaps and mismatches 250 """ 251 deletions = getsum(re.findall("(\d+)D", cigar)) 252 insertions = getsum(re.findall("(\d+)I", cigar)) 253 gaps = int(insertions + deletions) 254 255 # mismatches are: 256 # ALL Matches from CIGAR - RealMatches from MDTag 257 # 258 259 mismatch = getsum(re.findall("(\d+)M", cigar)) - getsum(re.findall("(\d+)", mdtag)) 260 261 262 #print ("%s\t%s\t%s\t%s\t%s\t%s" %(mdtag,cigar,deletions,insertions,mismatch,gaps)) 263 return(gaps, mismatch)
264
265 -def getAllID(textfile, read, compareList, readdic):
266 """ 267 Reads all alignments for the current read which are in the SAM file (sorted). If a new read ID is scanned the results are returned. 268 269 @type textfile: fileobject stream 270 @param textfile: SAM file for reading. 271 @type read: read obj 272 @param read: the last read obj which defines the current read id 273 @type readdic: dictionary 274 @param readdic: Look up for quality values. 275 @rtype: (bool,list,bool) 276 @return: a "triple", where 2 bools are defined as indicator variables and a list with all alignments for one read. 277 """ 278 # end of file? 279 if read == 0: 280 #parse last read and return 281 CRead = getNextLine(textfile, compareList, readdic) 282 return(getAllID(textfile, CRead, compareList, readdic)) 283 else: 284 # start ne list and append read which was read with the last run of getALLID 285 templist = [] 286 templist.append(read) 287 # get all reads with same id as "read" 288 while 1: 289 line = textfile.readline() 290 # no text left infile 291 if not line: 292 return(0, templist, 0) 293 294 try: 295 CRead = getCustomRead(HTSeq.SAM_Alignment.from_SAM_line(line), compareList, readdic) 296 #CRead, dummy = readSAMline(line, "noMem", compareList, readdic) 297 except: 298 CRead, dummy = readSAMline(line, "noMem", compareList, readdic) 299 300 if CRead == 0: 301 # 302 return(1, templist, CRead) 303 # same read in current line as was in the last line --> continue reading 304 if CRead.id == read.id: 305 templist.append(CRead) 306 else: 307 return(1, templist, CRead)
308 309
310 -def CompareAlignments(reflist, artlist,file):
311 """ 312 Compares alignments for the reference and artificial reference for a specific read id. 313 The goal is to identify false positives. 314 315 @type reflist: read obj list 316 @param reflist: list containing alignments witht the same ID (reference) 317 @type artlist: read obj list 318 @param artlist: list containing alignments witht the same ID (artificial) 319 @rtype: np.array 320 @return: indices of unique alignments 321 """ 322 323 324 artlen = len(artlist) 325 reflen = len(reflist) 326 327 TPcount = reflen 328 FPcount = 0 329 for RefRead in reflist: 330 file.write(RefRead.toStrNoMem("ref")) 331 332 # add 1 for every alignment from the artificial that is found on the reference 333 for i in xrange(0, artlen): 334 # bool which decides if artificial in reference hit 335 ArtInRef = 0 336 for j in xrange(0,reflen): 337 if artlist[i].start == reflist[j].start and artlist[i].end == reflist[j].end: 338 ArtInRef = 1 339 break 340 # hit on artificial not confirmed on reference --> FP 341 if (ArtInRef == 0): 342 FPcount+=1 343 file.write(artlist[i].toStrNoMem("art")) 344 345 346 return(TPcount,FPcount)
347 348 """def WriteToFile(file, list, identifier): 349 350 Writes read obj. to a file. 351 352 353 @type file: read obj list 354 @param file: list containing alignments witht the same ID (reference) 355 @type list: read obj. list 356 @param list: list containing alignments witht the same ID (artificial) 357 @type identifier: string 358 @param identifier: indicator if the results are from the reference or artificial 359 360 file.write(getBestAlignment(list).toStrNoMem(identifier))""" 361
362 -def SkipHeader(file, compareList, readdic):
363 """ 364 Skips the header from a SAM file, but reads the first line of the alignment section. 365 366 @type file: read obj list 367 @param file: list containing alignments witht the same ID (reference) 368 @type compareList: list 369 @param compareList: list for accumulation of the same read id 370 @type readdic: dictionary 371 @param readdic: dictionary containing read ID - read quality mappings. 372 @rtype: read obj. 373 @return: Returns a read obj. from the SAM file. 374 """ 375 while 1: 376 temp = file.readline() 377 if temp.startswith("@"): 378 pass 379 else: 380 return(readSAMline(temp, "noMem", compareList, readdic)) 381 break
382 383
384 -def getRanks(RefRead,ArtRead,rankdic):
385 """ 386 Function which returns the ranks of 2 given readIDs (read from reference,read from artificial). 387 388 @type RefRead: read obj 389 @param RefRead: read obj (reference) 390 @type ArtRead: read obj 391 @param ArtRead: read obj (artificial) 392 @type rankdic: dictionary 393 @param rankdic: dictionary containing ranks of the read IDs (according to the sorted SAM files). 394 @rtype: int,int 395 @return: returns the false positives and true positives for a SAM file pair (reference, artificial) 396 """ 397 # get the ranks resulting from the sorted samfile. 398 RefReadRank = rankdic[RefRead[0].id] 399 if ArtRead[0].id in rankdic: 400 ArtReadRank = rankdic[ArtRead[0].id] 401 else: 402 # if this ReadID is not in the dictionary then it MUST be a FP! 403 ArtReadRank = RefReadRank - 1 404 405 return(RefReadRank,ArtReadRank)
406 407
408 -def ReadSAMnoMem(ref, art, output, compareList, readdic, rankdic):
409 """ 410 Main function for comparing to mappings. This functions takes the complete alignments for artificial and reference genome and goes through them in parallel. 411 Since the mappings are sorted the function alternates the parsing of the samfiles in such a way that no memory is used for comparing these functions. 412 413 @type ref: string 414 @param ref: path to reference alignments (SAM file) 415 @type art: string 416 @param art: path to artificial alignments (SAM file) 417 @type output: read obj 418 @param output: read obj (artificial) 419 @type compareList: read obj 420 @param compareList: read obj (artificial) 421 @type readdic: dictionary 422 @param readdic: dictionary containing read ID - read quality mappings. 423 @type rankdic: dictionary 424 @param rankdic: dictionary containing ranks of the read IDs (according to the sorted SAM files). 425 @rtype: ranks 426 @return: Returns the ranks for the 2 read ids. 427 """ 428 429 fobj = open(output, "w") 430 fobj.write("#ReadID\tMatchedReference\tSubstitutions\tNMtag\tReadQuality\tMappingQuality\tStart\tEnd\tGaps\tMisM\r\n") 431 fileref = open(ref, "r") 432 fileart = open(art, "r") 433 434 # SKip Header section of SAM file 435 CurrentReadRef, dummy = SkipHeader(fileref, compareList, readdic) 436 CurrentReadArt, dummy = SkipHeader(fileart, compareList, readdic) 437 438 # get first readIDs for looping 439 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 440 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 441 # variables for counting 442 443 tp = 0 444 fp = 0 445 MappedReads = 0 446 i = 0 447 start = time.time() 448 #Indicator variables which file is end first 449 ArtBool = 0 450 RefBool = 0 451 while 1: 452 i += 1 453 if i == 100000: 454 print "\t\t%f" % (time.time() - start), 455 456 RefReadRank,ArtReadRank = getRanks(RefRead,ArtRead,rankdic) 457 458 #case 1) 459 # Keep reading lines from artificial if the id is < than the one from the reference 460 # this means unique hits to the artificial! 461 #while (RefRead[0].id > ArtRead[0].id and hasNextLine != 0): 462 while (RefReadRank > ArtReadRank and hasNextLine != 0): 463 # in case of multiple hits, set all mr variables to 0 and get the best alignment 464 for l in xrange(0, len(ArtRead)): 465 ArtRead[l].mr = [0] 466 467 # write FP to file 468 for hits in ArtRead: 469 fobj.write(hits.toStrNoMem("art")) 470 fp += 1 471 472 473 # use akku for read lines 474 CurrentReadArt = NextReadArt 475 #get all the reads with the same ID into ArtRead 476 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 477 478 #update rank 479 ArtReadRank = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id] 480 MappedReads+=1 481 482 #case 2) 483 # keep reading lines from reference if id is < art.id 484 # all these hits are purely TP 485 #while (RefRead[0].id < ArtRead[0].id and hasNextLine != 0): 486 while (RefReadRank < ArtReadRank and hasNextLine != 0): 487 488 for hits in RefRead: 489 fobj.write(hits.toStrNoMem("ref")) 490 tp += 1 491 492 CurrentReadRef = NextReadRef 493 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 494 RefReadRank = rankdic[RefRead[0].id] 495 MappedReads+=1 496 497 498 # case 3) 499 #if (RefRead[0].id == ArtRead[0].id): 500 if (RefReadRank == ArtReadRank): 501 MappedReads+=1 502 503 #same read.id --> compare all Alignments for one read id (can be multiple alignments) 504 # return unique hits to artificial genome 505 tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj) 506 tp +=tempTP 507 fp +=tempFP 508 # overwrite akku 509 CurrentReadRef = NextReadRef 510 CurrentReadArt = NextReadArt 511 512 513 # get next Read for next ID and check if file has a next line 514 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 515 if hasNextLine == 0: 516 #print "REFBREAK" 517 RefReadRank = rankdic[RefRead[0].id] 518 RefBool = 1 519 break 520 521 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 522 if hasNextLine == 0: 523 #print "ARTBREAK" 524 ArtReadRank = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id] 525 ArtBool = 1 526 break 527 528 # if the while loop was broken one read has to be read in the not closed file 529 hasNextLine = 1 530 # the file which has still some lines is now processed linearly... 531 if RefBool == 1: 532 #print ("REF FILE FINISHED!!!") 533 # reference no lines --> read artificial 534 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 535 ArtReadRank = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id] 536 537 while (RefReadRank > ArtReadRank and hasNextLine != 0): 538 MappedReads+=1 539 # in case of multiple hits, set all mr variables to 0 and get the best alignment 540 for l in xrange(0, len(ArtRead)): 541 ArtRead[l].mr = [0] 542 543 # write current FP to file 544 # write FP to file 545 for hits in ArtRead: 546 fobj.write(hits.toStrNoMem("art")) 547 fp += 1 548 # use akku for read lines 549 CurrentReadArt = NextReadArt 550 #get all the reads with the same ID into ArtRead 551 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 552 553 #update rank 554 ArtReadRank = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id] 555 556 557 558 if (RefReadRank == ArtReadRank): 559 MappedReads+=1 560 #same read.id --> compare all Alignments for one read id (can be multiple alignments) 561 # return unique hits to artificial genome 562 tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj) 563 tp +=tempTP 564 fp +=tempFP 565 566 567 while 1: 568 569 if hasNextLine == 0: 570 break 571 else: 572 573 CurrentReadArt = NextReadArt 574 # read next artificial read! 575 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic) 576 577 578 for hits in ArtRead: 579 fobj.write(hits.toStrNoMem("art")) 580 fp += 1 581 MappedReads+=1 582 583 else: 584 # read again reference file as long as the final read from he artificial file is reached 585 while (RefReadRank < ArtReadRank and hasNextLine != 0): 586 MappedReads+=1 587 588 for hits in RefRead: 589 fobj.write(hits.toStrNoMem("ref")) 590 tp += 1 591 CurrentReadRef = NextReadRef 592 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 593 RefReadRank = rankdic[RefRead[0].id] 594 595 596 if (RefReadRank == ArtReadRank): 597 MappedReads+=1 598 #same read.id --> compare all Alignments for one read id (can be multiple alignments) 599 # return unique hits to artificial genome 600 tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj) 601 tp +=tempTP 602 fp +=tempFP 603 604 605 else: 606 MappedReads+=1 607 for hits in ArtRead: 608 fobj.write(hits.toStrNoMem("art")) 609 fp += 1 610 611 while 1: 612 if hasNextLine == 0: 613 MappedReads+=1 614 for hits in RefRead: 615 fobj.write(hits.toStrNoMem("ref")) 616 tp += 1 617 618 break 619 else: 620 CurrentReadRef = NextReadRef 621 MappedReads+=1 622 for hits in RefRead: 623 fobj.write(hits.toStrNoMem("ref")) 624 tp += 1 625 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic) 626 627 end = time.time() 628 print ("\t\t%f\t\t" %(end - start)), 629 fobj.close() 630 return(tp, fp,tp+fp,MappedReads)
631 ##################END: FUNCTIONS FOR NO-MEMORY ANALYSIS #################### 632 633
634 -def getNumberOf(line, tag):
635 """ 636 Reads the alignment tag, given the text and a tag to search for. 637 638 @type line: string 639 @param line: SAM line 640 @type tag: string 641 @param tag: SAM tag. i.e: NM,MD 642 @rtype: int 643 @return: number x behind desired tag tag:i:x 644 """ 645 646 # gives the number of XX back 647 # valid tags: X0,XM (number of missmatches 648 m = re.search(tag + ":i:(\d+)", line) 649 besthits = m.group(1) 650 651 return(besthits)
652
653 -def getsum(strlist):
654 """ 655 Sum of strings (as ints) 656 657 @type strlist: list(str) 658 @param strlist: SAM line 659 660 @rtype: int 661 @return: MD tag calculation. 662 """ 663 664 return(sum([int(i) for i in strlist]))
665
666 -def getmean(strlist):
667 """ 668 Mean of strings. 669 670 @type strlist: list(str) 671 @param strlist: SAM line 672 673 @rtype: float 674 @return: mean 675 """ 676 sum = getsum(strlist) 677 return(sum/float(len(strlist)))
678 679
680 -def ComputeRQScore(quality):
681 """ 682 Computes the ReadQualityScore given a quality string 683 684 @type quality: string 685 @param quality: quality string of a read. 686 687 @rtype: float 688 @return: ReadQualityScore (RQS) 689 """ 690 # phred --> -33 691 qualarray = [ord(i)-33 for i in quality] 692 # solexa --> -64 693 score =( sum(qualarray)/float(len(qualarray))) / (max(qualarray) - min(qualarray) + 1) 694 695 return (score)
696 697
698 -def getAlignmentLength(cigar):
699 """ 700 Computes the alignment length given a cigar string. Needed for Start + End calculation. 701 702 @type cigar: string 703 @param cigar: Cigar string (SAM) 704 705 @rtype: int 706 @return: alignmentlength 707 """ 708 # I - insertions do not extend the alignment 709 # S - does not extend... 710 # P - does not extend... 711 # H - does not extend... 712 match = re.findall("(\d+)[MDN]", cigar) 713 sumOfMatches = 0 714 for item in match: 715 sumOfMatches += int(item) 716 return (sumOfMatches)
717 718
719 -def isSaneAlignment(alignment, identifier, compareList, readdic):
720 """ 721 Checks alignment line for unnecessary informations to skip 722 723 @type alignment: string 724 @param alignment: Line from SAM 725 @type identifier: string 726 @param identifier: read id 727 @type compareList: list 728 @param compareList: list of alignments with same read id. 729 @type readdic: dictionary 730 @param readdic: Dictionary containg a read id; read quality mapping 731 @rtype: readobj, readname 732 @return: Returns the read and it's identifier. 733 """ 734 735 if alignment.startswith("@"): 736 return(0, "") 737 else: 738 read, readname = readSAMline(alignment, identifier, compareList, readdic) 739 if read == 0: 740 return(0, "") 741 else: 742 return (read, readname)
743 744
745 -def CheckForSameAlignments(readref, readart):
746 """ 747 Function for comparison of artificial alignments and reference alignments. FP are defined such that start and end position must be unique to the artificial reference 748 returns 0 if no same read is found (FP found) 749 returns 1 if an equal alignment is found 750 751 @type readref: read obj. 752 @param readref: reference 753 @type readart: read obj. 754 @param readart: artificial 755 @rtype: bool 756 @return: Indicator if alignment is the same (start & end equal) 757 """ 758 for i, item in enumerate(readref.start): 759 if (readref.start[i] == readart.start[0] and readref.end[i] == readart.end[0]): 760 return(1) 761 return(0)
762 763 764
765 -def getHammingdistance(CompareString, start, end):
766 """ 767 Computes the number of subsitutions in the artificial reference using the CompareString. 768 769 @type CompareString: string 770 @param CompareString: string of 0 and 1s. 1 = hamming 1 between reference and artificial. 771 @type start: int 772 @param start: start of alignment 773 @type end: int 774 @param end: end ofalignment 775 @rtype: int 776 @return: hamming distance 777 """ 778 return (sum(CompareString[start:end]))
779 780
781 -def readReadQualities(fastqfile):
782 """ 783 784 Reads a .fastqfile and calculates a defined readscore 785 input: fastq file 786 output: fastq dictionary key = readid; value = qualstr 787 788 789 @type fastqfile: string 790 @param fastqfile: path to fastq file 791 792 @rtype: dictionary 793 @return: dictionary containing read ids and read qualities. 794 """ 795 fastq_file = HTSeq.FastqReader(fastqfile , "phred") 796 readdictionary = {} 797 798 for read in fastq_file: 799 readdictionary[read.name] = ComputeRQScore(read.qualstr) 800 print("\tReading Fastq file done!") 801 return readdictionary
802 803 804 805
806 -def extendReadDic(readdic):
807 """ 808 Extends a given dictionary with KEY = readid and VALUE = qualstr such that an internal naming is generated which can be used to efficiently create an numpy array 809 810 811 @type readdic: dictionary 812 @param readdic: dictionary containing read ids and read qualities. 813 814 @rtype: dictionary 815 @return: extended readdic with KEY = ID, VALUE = READID object with READID.internalid and READID.qulstr = qualstr 816 """ 817 internalnaming = 0 818 reverseReadDic = np.zeros(len(readdic), dtype='S50') 819 for id in readdic.iterkeys(): 820 # i 821 readdic[id] = ReadID(internalnaming, readdic[id]) 822 reverseReadDic[internalnaming] = id 823 internalnaming += 1 824 return(readdic, reverseReadDic)
825 826 827
828 -def returnSequence(fasta):
829 """ 830 Returns a sequence string from a fasta file. 831 832 833 @type fasta: string 834 @param fasta: path to fasta file. 835 836 @rtype: string 837 @return: sequence 838 """ 839 fastafile = HTSeq.FastaReader(fasta) 840 for sequence in fastafile: 841 return(sequence.seq)
842 843 844
845 -def CreateCompareList(Reference, ARG):
846 """ 847 Creates a list which is used for comparisons between aligned reads (exact number of mismatches) 848 849 850 851 @type Reference: string 852 @param Reference: reference genome 853 @type ARG: string 854 @param ARG: artificial reference genome. 855 @rtype: list 856 @return: list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal. 857 """ 858 859 reference = returnSequence(Reference) 860 artificialreference = returnSequence(ARG) 861 complist = [] 862 863 864 if (len(reference) != len(artificialreference)): 865 print "first 10 letter:\t\t" + reference[0:10] + "..." + reference[-10:] 866 print "last 10 letter:\t\t" + artificialreference[0:10] + "..." + artificialreference[-10:] 867 print ("Error! Two Sequences have different length! Try to add a line break after the last nucleotide.") 868 sys.exit(1) 869 else: 870 return([ 0 if x == artificialreference[i] else 1 for i, x in enumerate(reference)])
871 872 873
874 -def readSAMline(alignment, identifier, compareList, readdic):
875 """ 876 Function for reading SAM alignment text file (one line) 877 878 879 880 @type alignment: string 881 @param alignment: SAM alignment 882 @type identifier: string 883 @param identifier: read id 884 @type compareList: list 885 @param compareList: list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal. 886 @type readdic: dictionary 887 @param readdic: dictionary containing read ids and read qualities. 888 @rtype: read obj. 889 @return: returns a customRead object 890 """ 891 k = 0 892 893 # ignore header 894 columns = alignment.split("\t") 895 flag = int(columns[1].strip()) 896 897 # flag 4 = not mapped to any genome 898 if flag == 4: 899 return(0, "") 900 901 else: 902 903 904 readname = columns[0].strip() 905 reference = columns[2].strip() 906 start = int(columns[3].strip())-1 907 mappingquality = columns[4].strip() 908 cigar = columns[5].strip() 909 qualstr = columns[10].strip() 910 tags = " ".join(columns[11:]) 911 912 mdtag = re.search("MD:Z:([^\s]+)", tags).group(1) 913 try: 914 mdtag = re.search("MD:Z:([^\s]+)", tags).group(1) 915 gaps, mism = getMisGap(mdtag, cigar) 916 except: 917 gaps = 0 918 mism = 0 919 print "Error Reading MDtag of %s.Setting gaps = 0,mism = 0" % readname 920 try: 921 nm = int(getNumberOf(tags, "NM")) 922 except: 923 nm = 0 924 925 leng = getAlignmentLength(cigar) 926 927 if len(qualstr) == 1 or (len(qualstr.strip()) == 0): 928 score = readdic[readname].quality 929 else: 930 score = ComputeRQScore(qualstr) 931 932 if identifier == "art": 933 #tempobj = CustomRead(0,nm, getHammingdistance(compareList, start, start+leng), score, mappingquality,start,start+leng) 934 tempobj = CustomRead(readname, 0, nm, getHammingdistance(compareList, start, start + leng), score, mappingquality, start, start + leng, gaps, mism) 935 elif identifier == "noMem": 936 tempobj = CustomRead(readname, 1, nm, getHammingdistance(compareList, start, start + leng), score, mappingquality, start, start + leng, gaps, mism) 937 938 else: 939 ##########TPRead(nm, score, mq, start, end, gaps, mism) 940 tempobj = TPRead(nm, score, mappingquality, start, start + leng, gaps, mism) 941 942 943 944 return(tempobj, readname)
945 946 947 948 949
950 -def returnIndex(readdic, readname):
951 """ 952 Returns the index of a read. The index is prescribed by the ordering in the sam file. 953 @type readname: string 954 @param readname: read id 955 @type readdic: dictionary 956 @param readdic: dictionary containing read ids and read qualities.ArtRead 957 @rtype: int 958 @return: index 959 """ 960 return(readdic[readname].internalID)
961 962
963 -def ReadArtificialSAMfileHTSeq(art, compareList, RefArray, readdic):
964 """ 965 Function for reading the artificial reference genome using HTSeq.This function is mainly used. Only if no quality string is in the SAM line. The custom 966 SAM reading function is used. 967 968 @type art: string 969 @param art: artificial file. 970 @type RefArray: array 971 @param RefArray: Results from reading the reference SAM file. 972 @type compareList: list 973 @param compareList: list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal. 974 @type readdic: dictionary 975 @param readdic: dictionary containing read ids and read qualities. 976 @rtype: array 977 @return: aligned read objects in an array. 978 """ 979 start = time.time() 980 #print ("\tARTIFICIAL:") 981 fobj = open(art, "r") 982 artdic = {} 983 k = 0 984 985 read = SkipHeader(fobj,compareList,readdic) 986 987 988 for alignment in fobj: 989 k += 1 990 if k % 1000000 == 0: 991 print ("%d.." %(k/1000000)), 992 read, readname = isSaneAlignment(alignment, "art", compareList, readdic) 993 if read == 0: 994 pass 995 else: 996 # Check by internal naming if spot in array is taken (!= 0) by a read 997 index = returnIndex(readdic, readname) 998 if RefArray[index] != 0: 999 # read already in dic? Check if the alignments are the same 1000 if (RefArray[index].isContained(read) == 0): 1001 if readname in artdic: 1002 artdic[readname].toObjself(read) 1003 1004 else: 1005 artdic[readname] = read 1006 else: 1007 pass 1008 1009 else: 1010 # new read? Just add it to the dictionary 1011 artdic[readname] = read 1012 1013 1014 fobj.close() 1015 end = time.time() 1016 #print("\r\n") 1017 print ("\t %f " % (end - start)), 1018 #print ("\tdone in %d seconds" % (end-start)) 1019 return(artdic)
1020 1021 1022
1023 -def writeToTabArt(ReadDic, outfile):
1024 """ 1025 Write function for hits ont he reference genome. Only the best alignment (least mismatches) 1026 Header : 1027 ReadID MatchedReference Substitutions NumberOfMismatches ReadQuality MappingQuality 1028 1029 1030 @type outfile: string 1031 @param outfile: Path to outfile. 1032 @type ReadDic: dictionary 1033 @param ReadDic: dictionary containing read ids and read qualities. 1034 """ 1035 1036 fobj = open(outfile, "a") 1037 for read in ReadDic.keys(): 1038 BestReadIndex = np.array(ReadDic[read].nm).argmin() 1039 fobj.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (read, ReadDic[read].mr[BestReadIndex], ReadDic[read].subs[BestReadIndex], ReadDic[read].nm[BestReadIndex], ReadDic[read].rq, ReadDic[read].mq[BestReadIndex], ReadDic[read].start[BestReadIndex], ReadDic[read].end[BestReadIndex], ReadDic[read].gaps[BestReadIndex], ReadDic[read].mism[BestReadIndex])) 1040 fobj.close()
1041 1042 1043 1044
1045 -def writeToTabRef(RefArray, outfile, reverseReadArray, ReadDic):
1046 """ 1047 Write function for hits ont he reference genome. Only the best alignment (least mismatches) 1048 1049 Header : 1050 ReadID MatchedReference Substitutions NumberOfMismatches ReadQuality MappingQuality 1051 rows in the infile 1052 @type RefArray: string 1053 @param RefArray: path to inputfile. 1054 @type outfile: int 1055 @param outfile: rows in the infile 1056 @type reverseReadArray: dictionary 1057 @param reverseReadArray: Contains reads = values and ranks = keys 1058 @type ReadDic: dictionary 1059 @param ReadDic: dictionary containing read ids and read qualities. 1060 """ 1061 1062 fobj = open(outfile, "a") 1063 for i in xrange(0, len(RefArray)): 1064 read = RefArray[i] 1065 # check if entry == 0 1066 if read == 0: 1067 continue 1068 else: 1069 readname = reverseReadArray[i] 1070 index = returnIndex(ReadDic, readname) 1071 BestReadIndex = np.array(RefArray[index].nm,dtype="int").argmin() 1072 fobj.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, 1, 0, RefArray[index].nm[BestReadIndex], RefArray[index].rq, RefArray[index].mq[BestReadIndex], RefArray[index].start[BestReadIndex], RefArray[index].end[BestReadIndex], RefArray[index].gaps[BestReadIndex], RefArray[index].mism[BestReadIndex])) 1073 fobj.close()
1074 1075 1076
1077 -def ReadFromTab(infile, arraysize):
1078 """ 1079 Reads the results from writeToTab in for plotting and analysis 1080 1081 Header : 1082 ReadID MatchedReference Substitutions NumberOfMismatches ReadQuality MappingQuality 1083 1084 @type infile: string 1085 @param infile: path to inputfile. 1086 @type arraysize: int 1087 @param arraysize: rows in the infile 1088 1089 @rtype: np.array(obj) 1090 @return: array for unique classified reads. 1091 """ 1092 1093 1094 fobj = open(infile, "r") 1095 dataArray = np.arange(arraysize, dtype=object) 1096 for i, line in enumerate(fobj): 1097 if line.startswith("#"): 1098 continue 1099 else: 1100 zeile = line.split("\t") 1101 id = zeile[0] 1102 mr = int(zeile[1]) 1103 subs = int(zeile[2]) 1104 nm = int(zeile[3]) 1105 rq = float(zeile[4]) 1106 mq = int(zeile[5]) 1107 gaps = int(zeile[8]) 1108 mism = int(zeile[9]) 1109 #n = int(zeile[10].strip()) 1110 # id is not needed for the analysis 1111 dataArray[i - 1] = CustomRead(id, mr, nm, subs, rq, mq, 0, 0, gaps, mism) 1112 1113 return (dataArray)
1114 1115 1116
1117 -def readInput(file):
1118 """ 1119 Function for reading in the controldictionary from the inputfile. 1120 1121 @type file: string 1122 @param file: Path to the input file. 1123 1124 @rtype: files 1125 @return: Controldictionary which is used to regulate the programs workflow. 1126 1127 """ 1128 fobj = open(file, "r") 1129 files = {} 1130 files["mapper"] = {} 1131 files["fasta"] = {} 1132 tempref = [] 1133 temparg = [] 1134 reffasta = "" 1135 artfasta = [] 1136 1137 for line in fobj: 1138 # reference 1139 if line.startswith("$"): 1140 files["fasta"]["reffasta"] = GetCompletePath(line.split(":")[1].strip()) 1141 # artificial 1142 if line.startswith("#"): 1143 artfasta.append(GetCompletePath(line.split(":")[1].strip())) 1144 # fastq file 1145 if line.startswith("&"): 1146 files["fastqfile"] = GetCompletePath(line.split(":")[1].strip()) 1147 # start sequence for mapper section 1148 if line.startswith("@"): 1149 zeile = line.split("\t") 1150 mapper = zeile[0][1:].strip() 1151 files["fasta"]["artfasta"] = artfasta 1152 if line.startswith("ref"): 1153 tempref.append(GetCompletePath(line.split(":")[1].strip())) 1154 if line.startswith("art"): 1155 temparg.append(GetCompletePath(line.split(":")[1].strip())) 1156 # end of mapper section 1157 if line.startswith("+"): 1158 files["mapper"][mapper] = [tempref, temparg] 1159 tempref = [] 1160 temparg = [] 1161 1162 1163 fobj.close() 1164 return(files)
1165
1166 -def GetCompletePath(path):
1167 return(path.replace("~",os.path.expanduser("~")))
1168
1169 -def initOutFiles(controlDic, mapper, outpath):
1170 """ 1171 Since we append in the program, we need to make sure no old files remain... 1172 1173 @type controlDic: dictionary 1174 @param controlDic: dictionary containing future filenames. 1175 @type mapper: string 1176 @param mapper: current identifier mapper for which the results are written 1177 @type outpath: string 1178 @param outpath: existing path, where the outfiles will be written. 1179 """ 1180 for artificial in controlDic["mapper"][mapper][1]: 1181 filename = artificial.split("/")[-1] 1182 outfile = mapper + "_" + artificial.split("/")[-1][:-4] + ".esam" 1183 #easy Sam output 1184 fobj = open(outpath + outfile, "w") 1185 fobj.write("#ReadID\tMatchedReference\tSubstitutions\tNumberOfMismatches\tReadQuality\tMappingQuality\tStart\tEnd\tGaps\tMisM\r\n") 1186 fobj.close()
1187