1 /**
2     This package contains methods for cropping a pile up.
3 
4     Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de>
5     License: Subject to the terms of the MIT license, as written in the
6              included LICENSE file.
7     Authors: Arne Ludwig <arne.ludwig@posteo.de>
8 */
9 module dentist.commands.processPileUps.cropper;
10 
11 import dentist.common :
12     ReadInterval,
13     ReferenceRegion,
14     ReferencePoint,
15     to;
16 import dentist.common.alignments :
17     AlignmentChain,
18     AlignmentLocationSeed,
19     coord_t,
20     getType,
21     PileUp,
22     ReadAlignment,
23     SeededAlignment,
24     trace_point_t;
25 import dentist.dazzler : buildDamFile, getFastaSequences;
26 import dentist.util.algorithm : sliceBy;
27 import dentist.util.log;
28 import dentist.util.math : ceil, ceildiv, RoundingMode;
29 import dentist.util.region : min, sup;
30 import std.algorithm :
31     all,
32     canFind,
33     copy,
34     joiner,
35     filter,
36     fold,
37     map,
38     sort,
39     sum,
40     swap;
41 import std.array : array, minimallyInitializedArray;
42 import std.conv : to;
43 import std.format : format;
44 import std.range :
45     chain,
46     chunks,
47     iota,
48     retro,
49     zip;
50 import std.typecons : tuple;
51 import vibe.data.json : toJson = serializeToJson;
52 
53 
54 struct CropOptions
55 {
56     string readsDb;
57     trace_point_t tracePointDistance;
58     string workdir;
59 }
60 
61 auto cropPileUp(PileUp pileUp, in ReferenceRegion mask, in CropOptions options)
62 {
63     auto cropper = PileUpCropper(pileUp, mask, options);
64     cropper.buildDb();
65 
66     return cropper.result;
67 }
68 
69 private struct PileUpCropper
70 {
71     PileUp pileUp;
72     const(ReferenceRegion) repeatMask;
73     const(CropOptions) options;
74     private ReferencePoint[] croppingRefPositions;
75     private string croppedDb;
76 
77     @property auto result()
78     {
79         return tuple!("db", "referencePositions")(croppedDb, croppingRefPositions);
80     }
81 
82     void buildDb()
83     {
84         fetchCroppingRefPositions();
85         auto croppedSequences = pileUpWithSequence().map!(t => getCroppedSequence(t.expand));
86         croppedDb = buildDamFile(croppedSequences, options.workdir);
87     }
88 
89     private void fetchCroppingRefPositions()
90     {
91         croppingRefPositions = getCroppingRefPositions();
92         logJsonDebug("croppingRefPositions", croppingRefPositions.toJson);
93 
94         foreach (refPos; croppingRefPositions)
95         {
96             if (refPos.value == -1)
97             {
98                 if (shouldLog(LogLevel.diagnostic))
99                 {
100                     auto involvedContigs = croppingRefPositions.map!"a.contigId".array;
101                     auto localRepeatMask = repeatMask
102                         .intervals
103                         .filter!(i => involvedContigs.canFind(i.contigId))
104                         .array;
105                     logJsonDiagnostic(
106                         "info", "could not find a common trace point",
107                         "croppingRefPositions", croppingRefPositions.toJson,
108                         "tracePointDistance", options.tracePointDistance,
109                         "pileUp", [
110                             "type": pileUp.getType.to!string.toJson,
111                             "readAlignments": shouldLog(LogLevel.debug_)
112                                 ? pileUp.map!"a[]".array.toJson
113                                 : toJson(null),
114                         ].toJson,
115                         "repeatMask", localRepeatMask.toJson,
116                     );
117                 }
118 
119                 throw new Exception("could not find a common trace point");
120             }
121         }
122     }
123 
124     private auto pileUpWithSequence()
125     {
126         auto readIds = pileUp.map!"a[0].contigB.id + 0".array;
127 
128         return zip(
129             iota(pileUp.length),
130             pileUp,
131             getFastaSequences(options.readsDb, readIds, options.workdir),
132         );
133     }
134 
135     /**
136         Get points on the reference where the pileUp should be cropped. Returns
137         one common trace point for each involved contig.
138 
139         See_Also: `getCommonTracePoint`
140     */
141     private ReferencePoint[] getCroppingRefPositions()
142     {
143         auto alignmentsByContig = pileUp.splitAlignmentsByContigA();
144         auto commonTracePoints = alignmentsByContig
145             .map!(acs => getCommonTracePoint(acs, repeatMask, options.tracePointDistance));
146         auto contigAIds = alignmentsByContig.map!"a[0].contigA.id";
147 
148         auto cropPos = zip(contigAIds, commonTracePoints)
149             .map!(zipped => ReferencePoint(zipped.expand))
150             .array;
151 
152         debug logJsonDebug(
153             "alignmentsByContig", alignmentsByContig.toJson,
154             "pileUp", [
155                 "type": pileUp.getType.to!string.toJson,
156                 "readAlignments": pileUp.map!"a[]".array.toJson,
157             ],
158             "cropPos", cropPos.toJson,
159         );
160 
161         return cropPos;
162     }
163 
164     private string getCroppedSequence(
165         in size_t croppedDbIdx,
166         in ReadAlignment readAlignment,
167         in string readSequence,
168     )
169     {
170         auto readCroppingSlice = getReadCroppingSlice(readAlignment);
171         debug logJsonDebug(
172             "croppedDbIdx", croppedDbIdx,
173             "readCroppingSlice", readCroppingSlice.toJson,
174         );
175 
176         return getCroppedReadAsFasta(
177             readSequence,
178             readCroppingSlice,
179         );
180     }
181 
182     private auto getReadCroppingSlice(in ReadAlignment readAlignment)
183     {
184         auto readCroppingSlice = readAlignment[]
185             .map!(alignment => alignment.getCroppingSlice(croppingRefPositions))
186             .fold!"a & b";
187         assert(!readCroppingSlice.empty, "invalid/empty read cropping slice");
188 
189         return readCroppingSlice;
190     }
191 
192     private string getCroppedReadAsFasta(string readSequence, in ReadInterval readCroppingSlice)
193     {
194         static enum fastaLineWidth = 100;
195         auto fastaHeader = format!">read-%d"(readCroppingSlice.readId);
196         auto fastaLength =
197             fastaHeader.length + 1 +                          // header line + line break
198             readCroppingSlice.size +                          // sequence + ...
199             ceildiv(readCroppingSlice.size, fastaLineWidth);  // line breaks for sequence +
200                                                               // new line at the end of file
201         auto croppedFasta = minimallyInitializedArray!(ubyte[])(fastaLength);
202         chain(
203             fastaHeader[],
204             "\n",
205             readSequence[readCroppingSlice.begin .. readCroppingSlice.end]
206                 .chunks(fastaLineWidth)
207                 .joiner("\n"),
208             "\n",
209         ).copy(croppedFasta);
210 
211         return cast(string) croppedFasta;
212     }
213 }
214 
215 /// Sort alignment chains of pileUp into groups with the same `contigA.id`.
216 private SeededAlignment[][] splitAlignmentsByContigA(PileUp pileUp)
217 {
218     if (pileUp.length == 0)
219     {
220         return [];
221     }
222 
223     auto orderedAlignments = pileUp
224         .map!"a[]"
225         .joiner
226         .array
227         .sort
228         .release;
229     auto alignmentsByContig = orderedAlignments.sliceBy!"a.contigA.id == b.contigA.id";
230 
231     return array(alignmentsByContig);
232 }
233 
234 /// Returns a common trace points wrt. contigA that is not in mask and is on
235 /// the relevant half of the contig.
236 private long getCommonTracePoint(
237     in SeededAlignment[] alignments,
238     in ReferenceRegion mask,
239     in size_t tracePointDistance
240 )
241 {
242     static long getCommonTracePoint(R)(R tracePointCandidates, ReferenceRegion allowedTracePointRegion) pure
243     {
244         auto commonTracePoints = tracePointCandidates.filter!(c => c in allowedTracePointRegion);
245 
246         return commonTracePoints.empty ? -1 : commonTracePoints.front.value.to!long;
247     }
248 
249     auto contigA = alignments[0].contigA;
250     auto locationSeed = alignments[0].seed;
251     auto commonAlignmentRegion = alignments
252         .map!(to!(ReferenceRegion, "contigA"))
253         .fold!"a & b";
254     auto allowedTracePointRegion = commonAlignmentRegion - mask;
255 
256     assert(alignments.all!(a => a.contigA == contigA && a.seed == locationSeed));
257     debug logJsonDebug(
258         "alignments", alignments.toJson,
259         "commonAlignmentRegion", commonAlignmentRegion.toJson,
260         "allowedTracePointRegion", allowedTracePointRegion.toJson,
261     );
262 
263     if (allowedTracePointRegion.empty)
264     {
265         return -1;
266     }
267 
268     auto tracePointMin = ceil(min(allowedTracePointRegion), tracePointDistance);
269     auto tracePointSup = ceil(sup(allowedTracePointRegion), tracePointDistance);
270     auto tracePointCandidates = iota(tracePointMin, tracePointSup, tracePointDistance)
271         .map!(tracePointCandidate => ReferencePoint(contigA.id, tracePointCandidate));
272 
273     return locationSeed == AlignmentLocationSeed.front
274         ? getCommonTracePoint(tracePointCandidates.retro, allowedTracePointRegion)
275         : getCommonTracePoint(tracePointCandidates, allowedTracePointRegion);
276 }
277 
278 private ReadInterval getCroppingSlice(
279     in SeededAlignment alignment,
280     in ReferencePoint[] croppingRefPoints,
281 )
282 {
283     auto locationSeed = alignment.seed;
284     auto read = alignment.contigB;
285     auto tracePointDistance = alignment.tracePointDistance;
286     auto croppingRefPos = croppingRefPoints
287         .filter!(p => p.contigId == alignment.contigA.id)
288         .front
289         .value;
290 
291     auto readCroppingPos = alignment
292         .translateTracePoint(cast(coord_t) croppingRefPos, RoundingMode.floor)
293         .contigB;
294     size_t readBeginIdx;
295     size_t readEndIdx;
296 
297     final switch (locationSeed)
298     {
299     case AlignmentLocationSeed.front:
300         readBeginIdx = 0;
301         readEndIdx = readCroppingPos;
302         break;
303     case AlignmentLocationSeed.back:
304         readBeginIdx = readCroppingPos;
305         readEndIdx = read.length;
306         break;
307     }
308 
309     if (alignment.complement)
310     {
311         swap(readBeginIdx, readEndIdx);
312         readBeginIdx = read.length - readBeginIdx;
313         readEndIdx = read.length - readEndIdx;
314     }
315 
316     auto croppingSlice = ReadInterval(read.id, readBeginIdx, readEndIdx);
317 
318     debug logJsonDebug(
319         "alignment", alignment.toJson,
320         "croppingRefPoints", croppingRefPoints.toJson,
321         "croppingSlice", croppingSlice.toJson,
322     );
323 
324     return croppingSlice;
325 }
326 
327 unittest
328 {
329     alias Contig = AlignmentChain.Contig;
330     alias Flags = AlignmentChain.Flags;
331     alias emptyFlags = AlignmentChain.emptyFlags;
332     alias complement = AlignmentChain.Flag.complement;
333     alias LocalAlignment = AlignmentChain.LocalAlignment;
334     alias Locus = LocalAlignment.Locus;
335     alias TracePoint = LocalAlignment.TracePoint;
336     enum tracePointDistance = 100;
337 
338     {
339         enum alignment = SeededAlignment(
340             AlignmentChain(
341                 0,
342                 Contig(1, 2584),
343                 Contig(58024, 10570),
344                 Flags(complement),
345                 [LocalAlignment(
346                     Locus(579, 2584),
347                     Locus(0, 2158),
348                     292,
349                     [
350                         TracePoint( 2,  23),
351                         TracePoint(11, 109),
352                         TracePoint(13, 109),
353                         TracePoint(15, 107),
354                         TracePoint(18, 107),
355                         TracePoint(14, 103),
356                         TracePoint(16, 106),
357                         TracePoint(17, 106),
358                         TracePoint( 9, 106),
359                         TracePoint(14, 112),
360                         TracePoint(16, 105),
361                         TracePoint(16, 114),
362                         TracePoint(10, 103),
363                         TracePoint(14, 110),
364                         TracePoint(15, 110),
365                         TracePoint(15, 101),
366                         TracePoint(17, 108),
367                         TracePoint(17, 109),
368                         TracePoint(15, 111),
369                         TracePoint(17, 111),
370                         TracePoint(11,  88),
371                     ],
372                 )],
373                 tracePointDistance,
374             ),
375             AlignmentLocationSeed.back,
376         );
377 
378         assert(
379             getCroppingSlice(alignment, [ReferencePoint(1, 600)]) ==
380             ReadInterval(58024, 0, 10547)
381         );
382         assert(
383             getCroppingSlice(alignment, [ReferencePoint(1, 800)]) ==
384             ReadInterval(58024, 0, 10329)
385         );
386     }
387     {
388         enum alignment = SeededAlignment(
389             AlignmentChain(
390                 0,
391                 Contig(1, 2584),
392                 Contig(7194, 9366),
393                 emptyFlags,
394                 [LocalAlignment(
395                     Locus(0, 724),
396                     Locus(8612, 9366),
397                     76,
398                     [
399                         TracePoint( 7, 107),
400                         TracePoint(17, 106),
401                         TracePoint(12,  96),
402                         TracePoint( 9, 107),
403                         TracePoint( 7, 102),
404                         TracePoint(11, 105),
405                         TracePoint(11, 105),
406                         TracePoint( 2,  26),
407                     ],
408                 )],
409                 tracePointDistance,
410             ),
411             AlignmentLocationSeed.front,
412         );
413 
414         assert(
415             getCroppingSlice(alignment, [ReferencePoint(1, 0)]) ==
416             ReadInterval(7194, 0, 8612)
417         );
418         assert(
419             getCroppingSlice(alignment, [ReferencePoint(1, 100)]) ==
420             ReadInterval(7194, 0, 8719)
421         );
422         assert(
423             getCroppingSlice(alignment, [ReferencePoint(1, 200)]) ==
424             ReadInterval(7194, 0, 8825)
425         );
426     }
427 }