1 /** 2 Some functions to work with FASTA data. 3 4 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dentist.util.fasta; 10 11 import std.algorithm : count, equal, joiner, startsWith; 12 import std.array : appender, array; 13 import std.conv : to; 14 import std.format : format, formattedRead; 15 import std.range : 16 back, 17 chain, 18 chunks, 19 drop, 20 ElementType, 21 empty, 22 front, 23 isBidirectionalRange, 24 only, 25 popBack, 26 popFront, 27 take, 28 walkLength; 29 import std..string : indexOf, lineSplitter, outdent; 30 import std.traits : isSomeChar, isSomeString; 31 import std.typecons : tuple, Tuple; 32 33 /** 34 Gives access to FASTA data. Does not copy the input sequence. 35 */ 36 template Fasta(T) if (isSomeString!T) 37 { 38 struct Fasta 39 { 40 static enum headerIndicator = '>'; 41 const T data; 42 private size_t[] recordIndex; 43 44 alias data this; 45 46 /** 47 Build an index in order to give fast access to individual records. 48 This is called implicitly when accessing individual records using 49 `opIndex` or `length`. 50 */ 51 void buildIndex() 52 { 53 if ((data.length > 0 && recordIndex.length > 0) || (data.length == 0 54 && recordIndex.length == 0)) 55 return; 56 57 recordIndex.reserve(data.count(headerIndicator) + 1); 58 long currIdx = data.indexOf(headerIndicator); 59 60 while (currIdx >= 0) 61 { 62 recordIndex ~= currIdx.to!size_t; 63 currIdx = data.indexOf(headerIndicator, currIdx + 1); 64 } 65 66 recordIndex ~= data.length; 67 } 68 69 /** 70 Get the FASTA record at idx (zero-based). 71 72 Returns: `FastaRecord!T` at index idx. 73 */ 74 FastaRecord!T opIndex(size_t idx) 75 { 76 assert(0 <= idx && idx < length, "index out of bounds"); 77 buildIndex(); 78 auto recordBegin = recordIndex[idx]; 79 auto recordEnd = recordIndex[idx + 1]; 80 81 return data[recordBegin .. recordEnd].parseFastaRecord(); 82 } 83 84 /// Get the number of FASTA records. 85 @property size_t length() 86 { 87 buildIndex(); 88 89 return recordIndex.length - 1; 90 } 91 92 /// Returns true iff line starts with '>'. 93 static bool isHeaderLine(in T line) pure 94 { 95 return line.startsWith(only(headerIndicator)); 96 } 97 } 98 } 99 100 /// 101 unittest 102 { 103 auto fasta1 = Fasta!string(q"EOF 104 >sequence1 105 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 106 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 107 >sequence2 108 AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG 109 TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC 110 EOF".outdent); 111 auto fasta1Records = [ 112 q"EOF 113 >sequence1 114 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 115 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 116 EOF".outdent.parseFastaRecord, 117 q"EOF 118 >sequence2 119 AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG 120 TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC 121 EOF".outdent.parseFastaRecord, 122 ]; 123 124 assert(fasta1.length == 2, fasta1.length.to!string); 125 assert(fasta1[0] == fasta1Records[0]); 126 assert(fasta1[1] == fasta1Records[1]); 127 } 128 129 /// Convenience wrapper around `Fasta!T(T data)`. 130 Fasta!T parseFasta(T)(T data) 131 { 132 return typeof(return)(data); 133 } 134 135 /// 136 unittest 137 { 138 string fastaData = q"EOF 139 >sequence1 140 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 141 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 142 >sequence2 143 AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG 144 TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC 145 EOF".outdent; 146 auto fasta = fastaData.parseFasta(); 147 auto fastaRecords = [ 148 FastaRecord!string(q"EOF 149 >sequence1 150 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 151 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 152 EOF".outdent), 153 FastaRecord!string(q"EOF 154 >sequence2 155 AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG 156 TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC 157 EOF".outdent), 158 ]; 159 160 assert(fasta.length == 2, fasta.length.to!string); 161 assert(fasta[0] == fastaRecords[0]); 162 assert(fasta[1] == fastaRecords[1]); 163 } 164 165 /** 166 Gives access to a single FASTA record. Does not copy the input sequence. 167 */ 168 template FastaRecord(T) if (isSomeString!T) 169 { 170 struct FastaRecord 171 { 172 static enum lineSep = "\n"; 173 alias Slice = Tuple!(int, int); 174 175 const T data; 176 177 alias data this; 178 179 /// Get this record in FASTA format. 180 auto toFasta(in size_t lineWidth = 50) pure const 181 { 182 auto formattedBody = this[].chunks(lineWidth).joiner(lineSep); 183 184 return chain(header, lineSep, formattedBody, lineSep); 185 } 186 187 /// Get the complete header line. 188 @property auto header() pure const 189 { 190 return data.lineSplitter.front; 191 } 192 193 /// Get the length of the sequence (in characters). 194 @property size_t length() pure const 195 { 196 return this[].walkLength; 197 } 198 199 @property size_t opDollar(size_t dim : 0)() 200 { 201 return length; 202 } 203 204 /// Get the sequence of this FASTA record without newlines. 205 auto opIndex() pure const 206 { 207 return data.lineSplitter.drop(1).joiner; 208 } 209 210 /// Get the sequence character at index `i` of this FASTA record. 211 auto opIndex(int i) pure const 212 { 213 i = normalizeIndex(i); 214 assert(0 <= i && i < length, 215 format!"index out of bounds: %d not in [-%d, %d)"(i, length, length)); 216 217 return this[].drop(i).front; 218 } 219 220 auto opIndex(in Slice slice) pure const 221 { 222 auto i = normalizeIndex(slice[0]); 223 auto j = normalizeIndex(slice[1]); 224 assert(0 <= i && i <= j && j <= length, 225 format!"index out of bounds: [%d, %d) not in [-%d, %d)"(i, j, length, length)); 226 227 return this[].drop(i).take(j - i); 228 } 229 230 /// Get sub-sequence from `i` to `j` (exclusive) of this FASTA record. 231 auto opSlice(size_t dim : 0)(int i, int j) 232 { 233 return tuple(i, j); 234 } 235 236 /// ditto 237 auto opSlice(size_t dim : 0)(size_t i, size_t j) 238 { 239 return tuple(i.to!int, j.to!int); 240 } 241 242 private int normalizeIndex(int i) const 243 { 244 auto length = this.length; 245 246 while (i < 0) 247 i += length; 248 249 return i; 250 } 251 } 252 } 253 254 /// 255 unittest 256 { 257 auto fastaRecord1 = q"EOF 258 >sequence1 259 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 260 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 261 EOF".outdent.parseFastaRecord; 262 263 assert(fastaRecord1.header == ">sequence1"); 264 assert(fastaRecord1[].equal("CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC")); 265 assert(fastaRecord1[0 .. 5].equal("CTAAC")); 266 assert(fastaRecord1.toFasta(13).equal(q"EOF 267 >sequence1 268 CTAACCCTAACCC 269 TAACCCTAACCCT 270 AACCCTAACCCTA 271 ACCCTAACCCTAA 272 CCCTAACCCTAAC 273 CCTAACCCTAACC 274 CTAACAACCCTAA 275 CCCTAACCC 276 EOF".outdent)); 277 278 auto fastaRecord2 = q"EOF 279 >sequence2 280 AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG 281 TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC 282 EOF".outdent.parseFastaRecord; 283 284 assert(fastaRecord2.header == ">sequence2"); 285 assert(fastaRecord2[].equal("AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTGTAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC")); 286 assert(fastaRecord2[5 .. 10].equal("GAGCA")); 287 assert(fastaRecord2.toFasta(45).equal(q"EOF 288 >sequence2 289 AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTG 290 TATTGTAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAAC 291 AACCCTAACC 292 EOF".outdent)); 293 } 294 295 /// Convenience wrapper around `FastaRecord!T(T data)`. 296 FastaRecord!T parseFastaRecord(T)(T data) 297 { 298 return typeof(return)(data); 299 } 300 301 /// 302 unittest 303 { 304 string fastaRecordData = q"EOF 305 >sequence1 306 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 307 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 308 EOF".outdent; 309 auto fastaRecord = fastaRecordData.parseFastaRecord(); 310 311 assert(fastaRecord.header == ">sequence1"); 312 assert(fastaRecord[].equal("CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC")); 313 assert(fastaRecord[0 .. 5].equal("CTAAC")); 314 } 315 316 template PacBioHeader(T) if (isSomeString!T) 317 { 318 struct PacBioHeader 319 { 320 static enum headerFormat = ">%s/%d/%d_%d %s"; 321 322 T name; 323 size_t well; 324 size_t qualityRegionBegin; 325 size_t qualityRegionEnd; 326 string additionalInformation; 327 328 /// Construct a `PacBioHeader!T` from `header`. 329 this(T header) 330 { 331 this.parse(header); 332 } 333 334 /// Assign new `header` data. 335 void opAssign(T header) 336 { 337 this.parse(header); 338 } 339 340 /// Builds the header string. 341 S to(S : T)() const 342 { 343 return buildHeader(); 344 } 345 346 private T buildHeader() const 347 { 348 return format!headerFormat( 349 name, 350 well, 351 qualityRegionBegin, 352 qualityRegionEnd, 353 additionalInformation, 354 ); 355 } 356 357 private void parse(in T header) 358 { 359 auto numMatches = header[].formattedRead!headerFormat( 360 name, 361 well, 362 qualityRegionBegin, 363 qualityRegionEnd, 364 additionalInformation, 365 ); 366 367 assert(numMatches == 5); 368 } 369 } 370 } 371 372 /// 373 unittest 374 { 375 string header = ">name/1/0_1337 RQ=0.75"; 376 auto pbHeader1 = PacBioHeader!string(header); 377 378 assert(pbHeader1.to!string == ">name/1/0_1337 RQ=0.75"); 379 assert(pbHeader1.name == "name"); 380 assert(pbHeader1.well == 1); 381 assert(pbHeader1.qualityRegionBegin == 0); 382 assert(pbHeader1.qualityRegionEnd == 1337); 383 assert(pbHeader1.additionalInformation == "RQ=0.75"); 384 385 PacBioHeader!string pbHeader2 = header; 386 387 assert(pbHeader2 == pbHeader1); 388 } 389 390 /// Convenience wrapper around `PacBioHeader!T(T header)`. 391 PacBioHeader!T parsePacBioHeader(T)(T header) 392 { 393 return typeof(return)(header); 394 } 395 396 /// 397 unittest 398 { 399 string header = ">name/1/0_1337 RQ=0.75"; 400 auto pbHeader1 = header.parsePacBioHeader(); 401 402 assert(pbHeader1.to!string == ">name/1/0_1337 RQ=0.75"); 403 assert(pbHeader1.name == "name"); 404 assert(pbHeader1.well == 1); 405 assert(pbHeader1.qualityRegionBegin == 0); 406 assert(pbHeader1.qualityRegionEnd == 1337); 407 assert(pbHeader1.additionalInformation == "RQ=0.75"); 408 } 409 410 /** 411 Get the complement of a DNA base. Only bases A, T, C, G will be translated; 412 all other characters are left as is. Replacement preserves casing of 413 the characters. 414 */ 415 C complement(C)(C base) if (isSomeChar!C) 416 { 417 import std.range : zip; 418 419 enum from = `AGTCagtc`; 420 enum to = `TCAGtcag`; 421 422 switch (base) 423 { 424 static foreach (conv; zip(from, to)) 425 { 426 case conv[0]: 427 return conv[1]; 428 } 429 default: 430 return base; 431 } 432 } 433 434 /** 435 Compute the reverse complement of a DNA sequence. Only bases A, T, C, G 436 will be translated; all other characters are left as is. Replacement 437 preserves casing of the characters. 438 */ 439 auto reverseComplementer(Range)(Range sequence) 440 if (isBidirectionalRange!Range && isSomeChar!(ElementType!Range)) 441 { 442 import std.algorithm : map; 443 import std.range : retro; 444 445 return sequence 446 .retro 447 .map!complement; 448 } 449 450 /// ditto 451 T reverseComplement(T)(in T sequence) if (isSomeString!T) 452 { 453 import std.array : array; 454 455 return sequence[].reverseComplementer.array.to!T; 456 } 457 458 FastaRecord!T reverseComplement(T)(in FastaRecord!T fastaRecord) if (isSomeString!T) 459 { 460 enum lineSep = FastaRecord!T.lineSep; 461 auto header = fastaRecord.header; 462 auto sequence = fastaRecord[].array.reverseComplement; 463 auto builder = appender!T; 464 465 builder.reserve(header.length + sequence.length + 2 * lineSep.length); 466 467 builder ~= header; 468 builder ~= lineSep; 469 builder ~= sequence; 470 builder ~= lineSep; 471 472 return typeof(return)(builder.data); 473 } 474 475 /// 476 unittest 477 { 478 auto seq = "GGTTGTAAATTGACTGTTGTCTGCT\ngccaatctactggtgggggagagat"; 479 auto revComp = "atctctcccccaccagtagattggc\nAGCAGACAACAGTCAATTTACAACC"; 480 481 assert(seq.reverseComplement == revComp); 482 assert(seq.reverseComplementer.equal(revComp)); 483 484 auto fastaRecord1 = q"EOF 485 >sequence1 486 CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT 487 AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC 488 EOF".outdent.parseFastaRecord; 489 auto fastaRecord1RevComp = fastaRecord1.reverseComplement; 490 491 assert(fastaRecord1RevComp.header == ">sequence1"); 492 assert(fastaRecord1RevComp[].equal("GGGTTAGGGTTAGGGTTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG")); 493 assert(fastaRecord1RevComp[0 .. 5].equal("GGGTT")); 494 assert(fastaRecord1RevComp.toFasta(13).equal(q"EOF 495 >sequence1 496 GGGTTAGGGTTAG 497 GGTTGTTAGGGTT 498 AGGGTTAGGGTTA 499 GGGTTAGGGTTAG 500 GGTTAGGGTTAGG 501 GTTAGGGTTAGGG 502 TTAGGGTTAGGGT 503 TAGGGTTAG 504 EOF".outdent)); 505 }