1 /**
2     Some functions to work with FASTA data.
3 
4     Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de>
5     License: Subject to the terms of the MIT license, as written in the
6              included LICENSE file.
7     Authors: Arne Ludwig <arne.ludwig@posteo.de>
8 */
9 module dentist.util.fasta;
10 
11 import std.algorithm : count, equal, joiner, startsWith;
12 import std.array : appender, array;
13 import std.conv : to;
14 import std.format : format, formattedRead;
15 import std.range :
16     back,
17     chain,
18     chunks,
19     drop,
20     ElementType,
21     empty,
22     front,
23     isBidirectionalRange,
24     only,
25     popBack,
26     popFront,
27     take,
28     walkLength;
29 import std..string : indexOf, lineSplitter, outdent;
30 import std.traits : isSomeChar, isSomeString;
31 import std.typecons : tuple, Tuple;
32 
33 /**
34     Gives access to FASTA data. Does not copy the input sequence.
35 */
36 template Fasta(T) if (isSomeString!T)
37 {
38     struct Fasta
39     {
40         static enum headerIndicator = '>';
41         const T data;
42         private size_t[] recordIndex;
43 
44         alias data this;
45 
46         /**
47             Build an index in order to give fast access to individual records.
48             This is called implicitly when accessing individual records using
49             `opIndex` or `length`.
50         */
51         void buildIndex()
52         {
53             if ((data.length > 0 && recordIndex.length > 0) || (data.length == 0
54                     && recordIndex.length == 0))
55                 return;
56 
57             recordIndex.reserve(data.count(headerIndicator) + 1);
58             long currIdx = data.indexOf(headerIndicator);
59 
60             while (currIdx >= 0)
61             {
62                 recordIndex ~= currIdx.to!size_t;
63                 currIdx = data.indexOf(headerIndicator, currIdx + 1);
64             }
65 
66             recordIndex ~= data.length;
67         }
68 
69         /**
70             Get the FASTA record at idx (zero-based).
71 
72             Returns: `FastaRecord!T` at index idx.
73         */
74         FastaRecord!T opIndex(size_t idx)
75         {
76             assert(0 <= idx && idx < length, "index out of bounds");
77             buildIndex();
78             auto recordBegin = recordIndex[idx];
79             auto recordEnd = recordIndex[idx + 1];
80 
81             return data[recordBegin .. recordEnd].parseFastaRecord();
82         }
83 
84         /// Get the number of FASTA records.
85         @property size_t length()
86         {
87             buildIndex();
88 
89             return recordIndex.length - 1;
90         }
91 
92         /// Returns true iff line starts with '>'.
93         static bool isHeaderLine(in T line) pure
94         {
95             return line.startsWith(only(headerIndicator));
96         }
97     }
98 }
99 
100 ///
101 unittest
102 {
103     auto fasta1 = Fasta!string(q"EOF
104         >sequence1
105         CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
106         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
107         >sequence2
108         AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG
109         TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC
110 EOF".outdent);
111     auto fasta1Records = [
112         q"EOF
113             >sequence1
114             CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
115             AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
116 EOF".outdent.parseFastaRecord,
117         q"EOF
118             >sequence2
119             AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG
120             TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC
121 EOF".outdent.parseFastaRecord,
122     ];
123 
124     assert(fasta1.length == 2, fasta1.length.to!string);
125     assert(fasta1[0] == fasta1Records[0]);
126     assert(fasta1[1] == fasta1Records[1]);
127 }
128 
129 /// Convenience wrapper around `Fasta!T(T data)`.
130 Fasta!T parseFasta(T)(T data)
131 {
132     return typeof(return)(data);
133 }
134 
135 ///
136 unittest
137 {
138     string fastaData = q"EOF
139         >sequence1
140         CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
141         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
142         >sequence2
143         AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG
144         TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC
145 EOF".outdent;
146     auto fasta = fastaData.parseFasta();
147     auto fastaRecords = [
148         FastaRecord!string(q"EOF
149             >sequence1
150             CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
151             AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
152 EOF".outdent),
153         FastaRecord!string(q"EOF
154             >sequence2
155             AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG
156             TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC
157 EOF".outdent),
158     ];
159 
160     assert(fasta.length == 2, fasta.length.to!string);
161     assert(fasta[0] == fastaRecords[0]);
162     assert(fasta[1] == fastaRecords[1]);
163 }
164 
165 /**
166     Gives access to a single FASTA record. Does not copy the input sequence.
167 */
168 template FastaRecord(T) if (isSomeString!T)
169 {
170     struct FastaRecord
171     {
172         static enum lineSep = "\n";
173         alias Slice = Tuple!(int, int);
174 
175         const T data;
176 
177         alias data this;
178 
179         /// Get this record in FASTA format.
180         auto toFasta(in size_t lineWidth = 50) pure const
181         {
182             auto formattedBody = this[].chunks(lineWidth).joiner(lineSep);
183 
184             return chain(header, lineSep, formattedBody, lineSep);
185         }
186 
187         /// Get the complete header line.
188         @property auto header() pure const
189         {
190             return data.lineSplitter.front;
191         }
192 
193         /// Get the length of the sequence (in characters).
194         @property size_t length() pure const
195         {
196             return this[].walkLength;
197         }
198 
199         @property size_t opDollar(size_t dim : 0)()
200         {
201             return length;
202         }
203 
204         /// Get the sequence of this FASTA record without newlines.
205         auto opIndex() pure const
206         {
207             return data.lineSplitter.drop(1).joiner;
208         }
209 
210         /// Get the sequence character at index `i` of this FASTA record.
211         auto opIndex(int i) pure const
212         {
213             i = normalizeIndex(i);
214             assert(0 <= i && i < length,
215                     format!"index out of bounds: %d not in [-%d, %d)"(i, length, length));
216 
217             return this[].drop(i).front;
218         }
219 
220         auto opIndex(in Slice slice) pure const
221         {
222             auto i = normalizeIndex(slice[0]);
223             auto j = normalizeIndex(slice[1]);
224             assert(0 <= i && i <= j && j <= length,
225                     format!"index out of bounds: [%d, %d) not in [-%d, %d)"(i, j, length, length));
226 
227             return this[].drop(i).take(j - i);
228         }
229 
230         /// Get sub-sequence from `i` to `j` (exclusive) of this FASTA record.
231         auto opSlice(size_t dim : 0)(int i, int j)
232         {
233             return tuple(i, j);
234         }
235 
236         /// ditto
237         auto opSlice(size_t dim : 0)(size_t i, size_t j)
238         {
239             return tuple(i.to!int, j.to!int);
240         }
241 
242         private int normalizeIndex(int i) const
243         {
244             auto length = this.length;
245 
246             while (i < 0)
247                 i += length;
248 
249             return i;
250         }
251     }
252 }
253 
254 ///
255 unittest
256 {
257     auto fastaRecord1 = q"EOF
258         >sequence1
259         CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
260         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
261 EOF".outdent.parseFastaRecord;
262 
263     assert(fastaRecord1.header == ">sequence1");
264     assert(fastaRecord1[].equal("CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC"));
265     assert(fastaRecord1[0 .. 5].equal("CTAAC"));
266     assert(fastaRecord1.toFasta(13).equal(q"EOF
267         >sequence1
268         CTAACCCTAACCC
269         TAACCCTAACCCT
270         AACCCTAACCCTA
271         ACCCTAACCCTAA
272         CCCTAACCCTAAC
273         CCTAACCCTAACC
274         CTAACAACCCTAA
275         CCCTAACCC
276 EOF".outdent));
277 
278     auto fastaRecord2 = q"EOF
279         >sequence2
280         AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTG
281         TAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC
282 EOF".outdent.parseFastaRecord;
283 
284     assert(fastaRecord2.header == ">sequence2");
285     assert(fastaRecord2[].equal("AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTGTATTGTAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACAACCCTAACC"));
286     assert(fastaRecord2[5 .. 10].equal("GAGCA"));
287     assert(fastaRecord2.toFasta(45).equal(q"EOF
288         >sequence2
289         AAGCTGAGCAGGGCTTTAAAGCTATCTTATTAATAATTATTTCTG
290         TATTGTAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAAC
291         AACCCTAACC
292 EOF".outdent));
293 }
294 
295 /// Convenience wrapper around `FastaRecord!T(T data)`.
296 FastaRecord!T parseFastaRecord(T)(T data)
297 {
298     return typeof(return)(data);
299 }
300 
301 ///
302 unittest
303 {
304     string fastaRecordData = q"EOF
305         >sequence1
306         CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
307         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
308 EOF".outdent;
309     auto fastaRecord = fastaRecordData.parseFastaRecord();
310 
311     assert(fastaRecord.header == ">sequence1");
312     assert(fastaRecord[].equal("CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC"));
313     assert(fastaRecord[0 .. 5].equal("CTAAC"));
314 }
315 
316 template PacBioHeader(T) if (isSomeString!T)
317 {
318     struct PacBioHeader
319     {
320         static enum headerFormat = ">%s/%d/%d_%d %s";
321 
322         T name;
323         size_t well;
324         size_t qualityRegionBegin;
325         size_t qualityRegionEnd;
326         string additionalInformation;
327 
328         /// Construct a `PacBioHeader!T` from `header`.
329         this(T header)
330         {
331             this.parse(header);
332         }
333 
334         /// Assign new `header` data.
335         void opAssign(T header)
336         {
337             this.parse(header);
338         }
339 
340         /// Builds the header string.
341         S to(S : T)() const
342         {
343             return buildHeader();
344         }
345 
346         private T buildHeader() const
347         {
348             return format!headerFormat(
349                 name,
350                 well,
351                 qualityRegionBegin,
352                 qualityRegionEnd,
353                 additionalInformation,
354             );
355         }
356 
357         private void parse(in T header)
358         {
359             auto numMatches = header[].formattedRead!headerFormat(
360                 name,
361                 well,
362                 qualityRegionBegin,
363                 qualityRegionEnd,
364                 additionalInformation,
365             );
366 
367             assert(numMatches == 5);
368         }
369     }
370 }
371 
372 ///
373 unittest
374 {
375     string header = ">name/1/0_1337 RQ=0.75";
376     auto pbHeader1 = PacBioHeader!string(header);
377 
378     assert(pbHeader1.to!string == ">name/1/0_1337 RQ=0.75");
379     assert(pbHeader1.name == "name");
380     assert(pbHeader1.well == 1);
381     assert(pbHeader1.qualityRegionBegin == 0);
382     assert(pbHeader1.qualityRegionEnd == 1337);
383     assert(pbHeader1.additionalInformation == "RQ=0.75");
384 
385     PacBioHeader!string pbHeader2 = header;
386 
387     assert(pbHeader2 == pbHeader1);
388 }
389 
390 /// Convenience wrapper around `PacBioHeader!T(T header)`.
391 PacBioHeader!T parsePacBioHeader(T)(T header)
392 {
393     return typeof(return)(header);
394 }
395 
396 ///
397 unittest
398 {
399     string header = ">name/1/0_1337 RQ=0.75";
400     auto pbHeader1 = header.parsePacBioHeader();
401 
402     assert(pbHeader1.to!string == ">name/1/0_1337 RQ=0.75");
403     assert(pbHeader1.name == "name");
404     assert(pbHeader1.well == 1);
405     assert(pbHeader1.qualityRegionBegin == 0);
406     assert(pbHeader1.qualityRegionEnd == 1337);
407     assert(pbHeader1.additionalInformation == "RQ=0.75");
408 }
409 
410 /**
411     Get the complement of a DNA base. Only bases A, T, C, G will be translated;
412     all other characters are left as is. Replacement preserves casing of
413     the characters.
414 */
415 C complement(C)(C base) if (isSomeChar!C)
416 {
417     import std.range : zip;
418 
419     enum from = `AGTCagtc`;
420     enum to = `TCAGtcag`;
421 
422     switch (base)
423     {
424         static foreach (conv; zip(from, to))
425         {
426             case conv[0]:
427                 return conv[1];
428         }
429         default:
430             return base;
431     }
432 }
433 
434 /**
435     Compute the reverse complement of a DNA sequence. Only bases A, T, C, G
436     will be translated; all other characters are left as is. Replacement
437     preserves casing of the characters.
438 */
439 auto reverseComplementer(Range)(Range sequence)
440         if (isBidirectionalRange!Range && isSomeChar!(ElementType!Range))
441 {
442     import std.algorithm : map;
443     import std.range : retro;
444 
445     return sequence
446         .retro
447         .map!complement;
448 }
449 
450 /// ditto
451 T reverseComplement(T)(in T sequence) if (isSomeString!T)
452 {
453     import std.array : array;
454 
455     return sequence[].reverseComplementer.array.to!T;
456 }
457 
458 FastaRecord!T reverseComplement(T)(in FastaRecord!T fastaRecord) if (isSomeString!T)
459 {
460     enum lineSep = FastaRecord!T.lineSep;
461     auto header = fastaRecord.header;
462     auto sequence = fastaRecord[].array.reverseComplement;
463     auto builder = appender!T;
464 
465     builder.reserve(header.length + sequence.length + 2 * lineSep.length);
466 
467     builder ~= header;
468     builder ~= lineSep;
469     builder ~= sequence;
470     builder ~= lineSep;
471 
472     return typeof(return)(builder.data);
473 }
474 
475 ///
476 unittest
477 {
478     auto seq = "GGTTGTAAATTGACTGTTGTCTGCT\ngccaatctactggtgggggagagat";
479     auto revComp = "atctctcccccaccagtagattggc\nAGCAGACAACAGTCAATTTACAACC";
480 
481     assert(seq.reverseComplement == revComp);
482     assert(seq.reverseComplementer.equal(revComp));
483 
484     auto fastaRecord1 = q"EOF
485         >sequence1
486         CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT
487         AACCCTAACCCTAACCCTAACCCTAACCCTAACAACCCTAACCCTAACCC
488 EOF".outdent.parseFastaRecord;
489     auto fastaRecord1RevComp = fastaRecord1.reverseComplement;
490 
491     assert(fastaRecord1RevComp.header == ">sequence1");
492     assert(fastaRecord1RevComp[].equal("GGGTTAGGGTTAGGGTTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG"));
493     assert(fastaRecord1RevComp[0 .. 5].equal("GGGTT"));
494     assert(fastaRecord1RevComp.toFasta(13).equal(q"EOF
495         >sequence1
496         GGGTTAGGGTTAG
497         GGTTGTTAGGGTT
498         AGGGTTAGGGTTA
499         GGGTTAGGGTTAG
500         GGTTAGGGTTAGG
501         GTTAGGGTTAGGG
502         TTAGGGTTAGGGT
503         TAGGGTTAG
504 EOF".outdent));
505 }