dentist.commands.maskRepetitiveRegions source code

1 /**
2     This is the `maskRepetitiveRegions` command of `dentist`.
3 
4     Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de>
5     License: Subject to the terms of the MIT license, as written in the
6              included LICENSE file.
7     Authors: Arne Ludwig <arne.ludwig@posteo.de>
8 */
9 module dentist.commands.maskRepetitiveRegions;
10 
11 import dentist.commandline : DentistCommand, OptionsFor;
12 import dentist.common :
13     ReferenceInterval,
14     ReferenceRegion;
15 import dentist.common.alignments :
16     AlignmentChain,
17     coord_t,
18     id_t;
19 import dentist.dazzler :
20     getAlignments,
21     writeMask;
22 import dentist.util.log;
23 import std.algorithm :
24     joiner,
25     map,
26     sort,
27     uniq;
28 import std.array : appender, array;
29 import std.format : format;
30 import std.range :
31     chain,
32     only;
33 import std.range.primitives :
34     ElementType,
35     isInputRange;
36 import std.typecons : Tuple;
37 import vibe.data.json : toJson = serializeToJson;
38 
39 /// Options for the `collectPileUps` command.
40 alias Options = OptionsFor!(DentistCommand.maskRepetitiveRegions);
41 
42 /// Execute the `collectPileUps` command with `options`.
43 void execute(in Options options)
44 {
45     auto assessor = new RepeatMaskAssessor(options);
46 
47     assessor.run();
48 }
49 
50 /// This class comprises the `collectPileUps` step of the `dentist` algorithm
51 class RepeatMaskAssessor
52 {
53     protected const Options options;
54     protected AlignmentChain[] selfAlignment;
55     protected AlignmentChain[] readsAlignment;
56     protected ReferenceRegion repetitiveRegions;
57 
58     this(in ref Options options)
59     {
60         this.options = options;
61     }
62 
63     void run()
64     {
65         mixin(traceExecution);
66 
67         readInputs();
68         assessRepeatStructure();
69         writeRepeatMask();
70     }
71 
72     protected void readInputs()
73     {
74         mixin(traceExecution);
75 
76         selfAlignment = getAlignments(
77             options.refDb,
78             options.selfAlignmentFile,
79             options.workdir,
80         );
81         readsAlignment = getAlignments(
82             options.refDb,
83             options.readsDb,
84             options.readsAlignmentFile,
85             options.workdir,
86         );
87 
88         if (selfAlignment.length == 0)
89             logJsonWarn("info", "empty self alignment");
90         if (readsAlignment.length == 0)
91             logJsonWarn("info", "empty ref vs. reads alignment");
92     }
93 
94     void assessRepeatStructure()
95     {
96         mixin(traceExecution);
97 
98         auto assessmentStages = [
99             assessmentStage(
100                 "self-alignment",
101                 new BadAlignmentCoverageAssessor(
102                     options.coverageBoundsSelf[0],
103                     options.coverageBoundsSelf[1]
104                 ),
105                 selfAlignment,
106             ),
107             assessmentStage(
108                 "reads-alignment",
109                 new BadAlignmentCoverageAssessor(
110                     options.coverageBoundsReads[0],
111                     options.coverageBoundsReads[1]
112                 ),
113                 readsAlignment,
114             ),
115         ];
116 
117         foreach (stage; assessmentStages)
118         {
119             auto repetitiveRegions = stage.assessor(stage.input);
120 
121             logJsonDiagnostic(
122                 "assessor", stage.name,
123                 "repetitiveRegions", shouldLog(LogLevel.debug_)
124                     ? repetitiveRegions.intervals.toJson
125                     : toJson(null),
126                 "numRepetitiveRegions", repetitiveRegions.intervals.length,
127             );
128 
129             if (shouldLog(LogLevel.debug_))
130             {
131                 auto maskName = format!"%s-%s"(options.repeatMask, stage.name);
132 
133                 writeMask(
134                     options.refDb,
135                     maskName,
136                     repetitiveRegions.intervals,
137                     options.workdir,
138                 );
139             }
140 
141             this.repetitiveRegions |= repetitiveRegions;
142         }
143 
144         logJsonDiagnostic(
145             "assessor", "finalResult",
146             "repetitiveRegions", shouldLog(LogLevel.debug_)
147                 ? this.repetitiveRegions.intervals.toJson
148                 : toJson(null),
149             "numRepetitiveRegions", this.repetitiveRegions.intervals.length,
150         );
151     }
152 
153     protected void writeRepeatMask()
154     {
155         mixin(traceExecution);
156 
157         writeMask(
158             options.refDb,
159             options.repeatMask,
160             repetitiveRegions.intervals,
161             options.workdir,
162         );
163     }
164 }
165 
166 // see `assessmentStage` below.
167 alias MaskAssessmentStage(Assessor : RepeatAssessor) = Tuple!(
168     string, "name",
169     Assessor, "assessor",
170     const(AlignmentChain[]), "input",
171 );
172 
173 /// Construct a stage of the repeat mask assessment pipeline.
174 MaskAssessmentStage!Assessor assessmentStage(Assessor : RepeatAssessor)(
175     string name,
176     Assessor assessor,
177     const(AlignmentChain[]) input,
178 )
179 {
180     return typeof(return)(name, assessor, input);
181 }
182 
183 /**
184     A `RepeatAssessor` should generate a repeat mask derived from a set of
185     alignment chains.
186 */
187 interface RepeatAssessor
188 {
189     ReferenceRegion opCall(in AlignmentChain[] input);
190 }
191 
192 /**
193     Mask reference regions where the alignment coverage is not within set
194     limits. This helps to identify repetitive or bad quality regions.
195 */
196 class BadAlignmentCoverageAssessor : RepeatAssessor
197 {
198     double lowerLimit;
199     double upperLimit;
200 
201     /// Create an assessor with these limits.
202     this(double lowerLimit, double upperLimit)
203     {
204         this.lowerLimit = lowerLimit;
205         this.upperLimit = upperLimit;
206     }
207 
208     version (unittest)
209     {
210         /**
211             Generate a list of alignment chains for testing:
212 
213             ```
214                                  c1                             c2                    c3
215                    0    5   10   15   20   25   30       0    5   10   15      0    5   10   15
216             ref:   [-----------------------------)       [--------------)      [--------------)
217             reads: .    .    .    .    .    .    .       .    .    .    .      .    .    .    .
218                    . #1 [------------) .    .    .   #12 [--) .    .    .  #23 .[--).    .    .
219                    . #2 [------------) .    .    .   #13 [--) .    .    .  #24 . [--)    .    .
220                    . #3 [--------------)    .    .   #14 [----)    .    .  #25 .  [--)   .    .
221                    .    . #4 [---------)    .    .   #15 [----)    .    .  #26 .   [--)  .    .
222                    .    . #5 [-------------------)   #16 [--------------)  #27 .    [--) .    .
223                    .    . #6 [-------------------)   #17 [--------------)  #28 .    .[--).    .
224                    .    .    #7 [----------------)   #18 [--------------)  #29 .    . [--)    .
225                    .    .    .    . #8 [---------)       .#19 [---------)  #30 .    .  [--)   .
226                    .    .    .    . #9 [---------)       .#20 [---------)  #31 .    .   [--)  .
227                    .    .    .    .#10 [---------)       .#21 [---------)  #32 .    .    [--) .
228                    .    .    .    .    #11 [-----)       .    #22 [-----)  #33 .    .    .[--).
229                    .    .    .    .    .    .    .       .    .    .    .      .    .    .    .
230             mask:  [====)    [=======) [=========)       [==) [=========)      [==) .    . [==)
231                    .    .    .    .    .    .    .       .    .    .    .      .    .    .    .
232             cov:   ^    .    .    .    .    .    .       .    .    .    .      .    .    .    .
233                    |    .    .    .    .    .    .       .    .    .    .      .    .    .    .
234                    |    .    .  +----+ .   +----+.       +-+  .   +----+.      .    .    .    .
235                    |    .    +--+####| +---+####|.       |#|  +---+####|.      .    .    .    .
236                  5 |.........|#######+-+########|.       |#+--+########|.      .    .    .    .
237                    |    .    |##################|.       |#############|.      .    .    .    .
238                    |    +----+##################|.       |#############|.      .  +-------+   .
239                    |    |#######################|.       |#############|.      . ++#######++  .
240                    |    |#######################|.       |#############|.      .++#########++ .
241                  0 +----+-----------------------+--------+-------------+--------+-----------+---->
242             ```
243         */
244         private static AlignmentChain[] getTestAlignments()
245         {
246             with (AlignmentChain) with (LocalAlignment)
247                 {
248                     id_t alignmentChainId = 0;
249                     id_t contReadId = 0;
250                     AlignmentChain getDummyAlignment(id_t contigId,
251                             coord_t contigLength, coord_t beginIdx, coord_t endIdx)
252                     {
253                         return AlignmentChain(
254                             ++alignmentChainId,
255                             Contig(contigId, contigLength),
256                             Contig(alignmentChainId, endIdx),
257                             emptyFlags,
258                             [
259                                 LocalAlignment(
260                                     Locus(beginIdx, beginIdx + 1),
261                                     Locus(0, 1),
262                                     0,
263                                 ),
264                                 LocalAlignment(
265                                     Locus(endIdx - 1, endIdx),
266                                     Locus(2, 3),
267                                     0,
268                                 ),
269                             ],
270                         );
271                     }
272 
273                     return [
274                         getDummyAlignment(1, 30,  5, 18), //  #1
275                         getDummyAlignment(1, 30,  5, 18), //  #2
276                         getDummyAlignment(1, 30,  5, 20), //  #3
277                         getDummyAlignment(1, 30, 10, 20), //  #4
278                         getDummyAlignment(1, 30, 10, 30), //  #5
279                         getDummyAlignment(1, 30, 10, 30), //  #6
280                         getDummyAlignment(1, 30, 13, 30), //  #7
281                         getDummyAlignment(1, 30, 20, 30), //  #8
282                         getDummyAlignment(1, 30, 20, 30), //  #9
283                         getDummyAlignment(1, 30, 20, 30), // #10
284                         getDummyAlignment(1, 30, 24, 30), // #11
285                         getDummyAlignment(2, 15,  0,  3), // #12
286                         getDummyAlignment(2, 15,  0,  3), // #13
287                         getDummyAlignment(2, 15,  0,  5), // #14
288                         getDummyAlignment(2, 15,  0,  5), // #15
289                         getDummyAlignment(2, 15,  0, 15), // #16
290                         getDummyAlignment(2, 15,  0, 15), // #17
291                         getDummyAlignment(2, 15,  0, 15), // #18
292                         getDummyAlignment(2, 15,  5, 15), // #19
293                         getDummyAlignment(2, 15,  5, 15), // #20
294                         getDummyAlignment(2, 15,  5, 15), // #21
295                         getDummyAlignment(2, 15,  9, 15), // #22
296                         getDummyAlignment(3, 15,  1,  4), // #23
297                         getDummyAlignment(3, 15,  2,  5), // #24
298                         getDummyAlignment(3, 15,  3,  6), // #25
299                         getDummyAlignment(3, 15,  4,  7), // #26
300                         getDummyAlignment(3, 15,  5,  8), // #27
301                         getDummyAlignment(3, 15,  6,  9), // #28
302                         getDummyAlignment(3, 15,  7, 10), // #29
303                         getDummyAlignment(3, 15,  8, 11), // #30
304                         getDummyAlignment(3, 15,  9, 12), // #31
305                         getDummyAlignment(3, 15, 10, 13), // #32
306                         getDummyAlignment(3, 15, 11, 14), // #33
307                     ];
308                 }
309         }
310     }
311 
312     /// Apply the assessor to the given set of alignment.
313     override ReferenceRegion opCall(const(AlignmentChain[]) alignments)
314     {
315         if (alignments.length == 0)
316         {
317             return ReferenceRegion();
318         }
319 
320         static enum OK = CoverageZone.ok;
321         auto maskAcc = appender!(ReferenceInterval[]);
322         auto masker = Masker();
323         auto changeEvents = coverageChanges(alignments);
324         auto lastEvent = changeEvents.front;
325 
326         debug logJsonDebug("changeEvents", changeEvents.array.toJson);
327         foreach (event; changeEvents)
328         {
329             auto currentZone = coverageZone(event.currentCoverage);
330             auto newZone = coverageZone(event.newCoverage);
331 
332             if (masker.isMasking && event.contigId != lastEvent.contigId)
333             {
334                 maskAcc ~= masker.finish(lastEvent.position);
335             }
336 
337             if (
338                 !masker.isMasking &&
339                 (newZone != OK || (currentZone == OK && currentZone != newZone))
340             )
341             {
342                 masker.start(event.contigId, event.position);
343             }
344             else if (masker.isMasking && currentZone != OK && newZone == OK)
345             {
346                 maskAcc ~= masker.finish(event.position);
347             }
348 
349             lastEvent = event;
350         }
351 
352         if (masker.isMasking)
353         {
354             maskAcc ~= masker.finish(lastEvent.position);
355         }
356 
357         return ReferenceRegion(maskAcc.data);
358     }
359 
360     ///
361     unittest
362     {
363         auto alignments = getTestAlignments();
364         alias CoverageChange = CoverageChangeRange.CoverageChange;
365 
366         auto assessor = new BadAlignmentCoverageAssessor(3, 5);
367 
368         assert(assessor(alignments) == ReferenceRegion([
369             ReferenceInterval(1,  0,  5),
370             ReferenceInterval(1, 10, 18),
371             ReferenceInterval(1, 20, 30),
372             ReferenceInterval(2,  0,  3),
373             ReferenceInterval(2,  5, 15),
374             ReferenceInterval(3,  0,  3),
375             ReferenceInterval(3, 12, 15),
376         ]));
377     }
378 
379     private CoverageZone coverageZone(in int coverage) pure nothrow
380     {
381         return coverage < lowerLimit
382             ? CoverageZone.low
383             : coverage > upperLimit
384                 ? CoverageZone.high
385                 : CoverageZone.ok;
386     }
387 }
388 
389 private enum CoverageZone
390 {
391     low,
392     ok,
393     high,
394 }
395 
396 private struct Masker
397 {
398     private bool _isMasking = false;
399     private size_t contigId;
400     private size_t maskStart;
401 
402     void start(in size_t contigId, in size_t maskStart) pure nothrow
403     {
404         this._isMasking = true;
405         this.contigId = contigId;
406         this.maskStart = maskStart;
407     }
408 
409     ReferenceInterval finish(in size_t maskEnd) pure nothrow
410     {
411         this._isMasking = false;
412 
413         return ReferenceInterval(
414             this.contigId,
415             this.maskStart,
416             maskEnd,
417         );
418     }
419 
420     @property bool isMasking() const pure nothrow
421     {
422         return this._isMasking;
423     }
424 }
425 
426 /// Transforms a range of alignment chains into a range of coverage
427 /// change events.
428 private struct CoverageChangeRange
429 {
430     static alias AlignmentEvent = Tuple!
431     (
432         size_t, "contigId",
433         size_t, "position",
434         int, "diff",
435     );
436 
437     static struct CoverageChange
438     {
439         size_t contigId;
440         size_t position;
441         int currentCoverage;
442         int newCoverage;
443 
444         bool hasChanged() const pure nothrow
445         {
446             return currentCoverage != newCoverage;
447         }
448     }
449 
450     AlignmentEvent[] alignmentEvents;
451     size_t currentEventIdx = 0;
452     CoverageChange _front;
453 
454     private this(AlignmentEvent[] alignmentEvents)
455     {
456         this.alignmentEvents = alignmentEvents;
457         if (!empty)
458         {
459             // Advance to first event.
460             popFront();
461         }
462     }
463 
464     private static CoverageChangeRange create(Range)(Range alignments)
465             if (isInputRange!Range && is(ElementType!Range : const(AlignmentChain)))
466     {
467         if (alignments.length == 0)
468         {
469             return CoverageChangeRange();
470         }
471 
472         auto alignmentEvents = alignments
473             .map!(alignment => only(
474                 AlignmentEvent(alignment.contigA.id, alignment.first.contigA.begin, 1),
475                 AlignmentEvent(alignment.contigA.id, alignment.last.contigA.end, -1),
476             ))
477             .joiner;
478         auto contigBoundaryEvents = alignments
479             .map!"a.contigA"
480             .uniq
481             .map!(contig => only(
482                 AlignmentEvent(contig.id, 0, 0),
483                 AlignmentEvent(contig.id, contig.length, 0),
484             ))
485             .joiner;
486         auto changeEvents = chain(alignmentEvents, contigBoundaryEvents).array;
487         changeEvents.sort();
488 
489         return CoverageChangeRange(changeEvents);
490     }
491 
492     void popFront()
493     {
494         _front.currentCoverage = _front.newCoverage;
495 
496         if (currentEventIdx == alignmentEvents.length)
497         {
498             // There is no more data; just pop the front and return;
499             assert(_front.currentCoverage == 0, "coverage should drop to zero in the end");
500             ++currentEventIdx;
501 
502             return;
503         }
504 
505         auto currentEvent = alignmentEvents[currentEventIdx];
506         auto eventAcc = currentEvent;
507         eventAcc.diff = 0;
508 
509         // Collect all events for one position.
510         for (; currentEventIdx < alignmentEvents.length; ++currentEventIdx)
511         {
512             currentEvent = alignmentEvents[currentEventIdx];
513 
514             if (
515                 eventAcc.contigId != currentEvent.contigId ||
516                 eventAcc.position != currentEvent.position
517             )
518             {
519                 break;
520             }
521 
522             eventAcc.diff += currentEvent.diff;
523         }
524 
525         assert(eventAcc.contigId == _front.contigId || _front.currentCoverage == 0,
526                 "coverage should drop to zero between contigs");
527         _front.contigId = eventAcc.contigId;
528         _front.position = eventAcc.position;
529         _front.newCoverage = _front.currentCoverage + eventAcc.diff;
530     }
531 
532     @property bool empty()
533     {
534         return currentEventIdx > alignmentEvents.length;
535     }
536 
537     @property CoverageChange front()
538     {
539         return _front;
540     }
541 }
542 
543 CoverageChangeRange coverageChanges(Range)(Range alignments)
544 {
545     return CoverageChangeRange.create(alignments);
546 }
547 
548 unittest
549 {
550     import std.algorithm : equal;
551 
552     auto alignments = BadAlignmentCoverageAssessor.getTestAlignments();
553 
554     alias CoverageChange = CoverageChangeRange.CoverageChange;
555 
556     assert(coverageChanges(alignments).equal([
557         CoverageChange(1,  0, 0, 0),
558         CoverageChange(1,  5, 0, 3),
559         CoverageChange(1, 10, 3, 6),
560         CoverageChange(1, 13, 6, 7),
561         CoverageChange(1, 18, 7, 5),
562         CoverageChange(1, 20, 5, 6),
563         CoverageChange(1, 24, 6, 7),
564         CoverageChange(1, 30, 7, 0),
565         CoverageChange(2,  0, 0, 7),
566         CoverageChange(2,  3, 7, 5),
567         CoverageChange(2,  5, 5, 6),
568         CoverageChange(2,  9, 6, 7),
569         CoverageChange(2, 15, 7, 0),
570         CoverageChange(3,  0, 0, 0),
571         CoverageChange(3,  1, 0, 1),
572         CoverageChange(3,  2, 1, 2),
573         CoverageChange(3,  3, 2, 3),
574         CoverageChange(3,  4, 3, 3),
575         CoverageChange(3,  5, 3, 3),
576         CoverageChange(3,  6, 3, 3),
577         CoverageChange(3,  7, 3, 3),
578         CoverageChange(3,  8, 3, 3),
579         CoverageChange(3,  9, 3, 3),
580         CoverageChange(3, 10, 3, 3),
581         CoverageChange(3, 11, 3, 3),
582         CoverageChange(3, 12, 3, 2),
583         CoverageChange(3, 13, 2, 1),
584         CoverageChange(3, 14, 1, 0),
585         CoverageChange(3, 15, 0, 0),
586     ]));
587 }