1 /**
2     This is the `output` command of `dentist`.
3 
4     Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de>
5     License: Subject to the terms of the MIT license, as written in the
6              included LICENSE file.
7     Authors: Arne Ludwig <arne.ludwig@posteo.de>
8 */
9 module dentist.commands.output;
10 
11 import dentist.commandline : DentistCommand, OptionsFor;
12 import dentist.common : ReferenceInterval, ReferencePoint;
13 import dentist.common.alignments :
14     AlignmentChain,
15     AlignmentLocationSeed,
16     id_t,
17     trace_point_t;
18 import dentist.common.binio :
19     CompressedSequence,
20     InsertionDb;
21 import dentist.common.insertions :
22     getInfoForExistingContig,
23     getInfoForGap,
24     getInfoForNewSequenceInsertion,
25     Insertion,
26     InsertionInfo,
27     isOutputGap,
28     isValidInsertion,
29     OutputScaffold,
30     SpliceSite;
31 import dentist.common.scaffold :
32     ContigNode,
33     ContigPart,
34     scaffoldStarts,
35     enforceJoinPolicy,
36     getUnkownJoin,
37     initScaffold,
38     isAntiParallel,
39     isDefault,
40     isExtension,
41     isFrontExtension,
42     isGap,
43     linearWalk,
44     normalizeUnkownJoins,
45     removeExtensions;
46 import dentist.dazzler :
47     ContigSegment,
48     GapSegment,
49     getNumContigs,
50     getFastaSequence,
51     getScaffoldStructure,
52     ScaffoldSegment;
53 import dentist.util.algorithm : replaceInPlace;
54 import dentist.util.fasta : complement, reverseComplementer;
55 import dentist.util.log;
56 import dentist.util.math :
57     absdiff,
58     add,
59     bulkAdd,
60     ceil,
61     floor,
62     mean;
63 import dentist.util.range : wrapLines;
64 import std.algorithm :
65     all,
66     canFind,
67     copy,
68     count,
69     countUntil,
70     filter,
71     find,
72     fold,
73     joiner,
74     map,
75     maxElement,
76     swap,
77     swapAt;
78 import std.array : array;
79 import std.conv : to;
80 import std.format : format;
81 import std.range :
82     enumerate,
83     only,
84     repeat,
85     takeExactly,
86     walkLength;
87 import std.range.primitives : empty, front, popFront, save;
88 import std.stdio : File, stderr, stdout;
89 import std.typecons : Flag, No, tuple, Tuple, Yes;
90 import vibe.data.json : toJson = serializeToJson;
91 
92 
93 /// Options for the `output` command.
94 alias Options = OptionsFor!(DentistCommand.output);
95 
96 
97 /// Execute the `output` command with `options`.
98 void execute(Options)(in Options options)
99 {
100     auto writer = new AssemblyWriter(options);
101 
102     writer.run();
103 }
104 
105 class AssemblyWriter
106 {
107     alias FastaWriter = typeof(wrapLines(stdout.lockingTextWriter, 0));
108 
109     protected const Options options;
110     protected const(ScaffoldSegment)[] scaffoldStructure;
111     size_t numReferenceContigs;
112     size_t[] contigLengths;
113     OutputScaffold assemblyGraph;
114     OutputScaffold.IncidentEdgesCache incidentEdgesCache;
115     ContigNode[] scaffoldStartNodes;
116     File assemblyFile;
117     FastaWriter writer;
118 
119     this(in ref Options options)
120     {
121         this.options = options;
122         this.assemblyFile = options.assemblyFile is null
123             ? stdout
124             : File(options.assemblyFile, "w");
125         this.writer = wrapLines(assemblyFile.lockingTextWriter, options.fastaLineWidth);
126     }
127 
128     void run()
129     {
130         mixin(traceExecution);
131 
132         init();
133         buildAssemblyGraph();
134         scaffoldStartNodes = scaffoldStarts!InsertionInfo(assemblyGraph, incidentEdgesCache).array;
135         logStatistics();
136 
137         foreach (startNode; scaffoldStartNodes)
138             writeNewScaffold(startNode);
139     }
140 
141     protected void init()
142     {
143         mixin(traceExecution);
144 
145         numReferenceContigs = getNumContigs(options.refDb, options.workdir);
146         scaffoldStructure = getScaffoldStructure(options.refDb).array;
147         contigLengths = scaffoldStructure
148             .filter!(part => part.peek!ContigSegment !is null)
149             .map!(contigPart => contigPart.get!ContigSegment)
150             .map!(contigPart => contigPart.end - contigPart.begin + 0)
151             .array;
152     }
153 
154 
155     protected void buildAssemblyGraph()
156     {
157         mixin(traceExecution);
158 
159         auto insertionDb = InsertionDb.parse(options.insertionsFile);
160         auto insertions = insertionDb[];
161 
162         assemblyGraph = initScaffold!(
163             contigId => InsertionInfo(CompressedSequence(), contigLengths[contigId - 1], []),
164             InsertionInfo,
165         )(numReferenceContigs);
166         assemblyGraph.bulkAdd!(joins => mergeInsertions(joins))(insertions);
167         appendUnkownJoins();
168 
169         if (!options.shouldExtendContigs)
170         {
171             assemblyGraph = assemblyGraph.removeExtensions!InsertionInfo();
172         }
173 
174         assemblyGraph = assemblyGraph
175             .enforceJoinPolicy!InsertionInfo(options.joinPolicy)
176             .normalizeUnkownJoins!InsertionInfo()
177             .fixCropping(options.tracePointDistance);
178         incidentEdgesCache = assemblyGraph.allIncidentEdges();
179 
180         if (options.assemblyGraphFile !is null)
181             InsertionDb.write(options.assemblyGraphFile, assemblyGraph.edges);
182     }
183 
184 
185     protected void appendUnkownJoins()
186     {
187         auto unkownJoins = scaffoldStructure[]
188             .filter!(part => part.peek!GapSegment !is null)
189             .map!(gapPart => gapPart.get!GapSegment)
190             .map!(gapPart => getUnkownJoin(
191                 gapPart.beginGlobalContigId,
192                 gapPart.endGlobalContigId,
193                 InsertionInfo(CompressedSequence(), gapPart.length, []),
194             ));
195         assemblyGraph.bulkAddForce(unkownJoins);
196     }
197 
198     void logStatistics()
199     {
200         mixin(traceExecution);
201 
202         // Due to several transformations of the assembly graph the predicates
203         // for "join types" must be combined to yield the expected result.
204         alias _isSpannedGap = j => j.isGap && !(j.isOutputGap || j.isDefault);
205         alias _isExtension = j => !j.isGap && !(j.isOutputGap || j.isDefault);
206         alias _isRemaingGap = isOutputGap;
207         alias _isExistingContig = isDefault;
208 
209         logJsonInfo(
210             "numGraphEdges", assemblyGraph.edges.length,
211             "numScaffolds", scaffoldStartNodes.length,
212             "numSpannedGaps", assemblyGraph.edges.count!_isSpannedGap,
213             "numExtensions", assemblyGraph.edges.count!_isExtension,
214             "numRemainingGaps", assemblyGraph.edges.count!_isRemaingGap,
215             "numExistingContigs", assemblyGraph.edges.count!_isExistingContig,
216         );
217 
218         if (shouldLog(LogLevel.diagnostic))
219             logSparseInsertionWalks();
220 
221         debug logJsonDebug(
222             "insertionWalks", scaffoldStartNodes
223                 .map!(startNode => linearWalk!InsertionInfo(assemblyGraph, startNode, incidentEdgesCache)
224                     .map!(join => [
225                         "start": join.start.toJson,
226                         "end": join.end.toJson,
227                         "payload": [
228                             "sequence": join.payload.sequence.to!string.toJson,
229                             "contigLength": join.payload.contigLength.toJson,
230                             "spliceSites": join.payload.spliceSites.toJson,
231                         ].toJson,
232                     ])
233                     .array
234                 )
235                 .array
236                 .toJson
237         );
238     }
239 
240     void logSparseInsertionWalks()
241     {
242         stderr.write(`{"scaffolds":[`);
243 
244         foreach (i, startNode; scaffoldStartNodes)
245         {
246             if (i == 0)
247                 stderr.write(`[`);
248             else
249                 stderr.write(`,[`);
250 
251             foreach (enumJoin; enumerate(linearWalk!InsertionInfo(assemblyGraph, startNode, incidentEdgesCache)))
252             {
253                 auto j = enumJoin.index;
254                 auto join = enumJoin.value;
255 
256                 if (j > 0)
257                     stderr.write(`,`);
258 
259                 if (join.isDefault)
260                     stderr.writef!`{"action":"copy","contigId":%d}`(join.start.contigId);
261                 else if (join.isOutputGap)
262                     stderr.writef!`{"action":"gap","length":%d}`(join.payload.contigLength);
263                 else if (join.isGap)
264                     stderr.writef!`{"action":"insert","contigIds":[%d,%d],"length":%d}`(
265                         join.start.contigId,
266                         join.end.contigId,
267                         join.payload.sequence.length
268                     );
269                 else if (join.isExtension)
270                     stderr.writef!`{"action":"insert","contigIds":[%d],"contigPart":"%s","length":%d}`(
271                         join.start.contigId,
272                         join.isFrontExtension ? "front" : "back",
273                         join.payload.sequence.length
274                     );
275                 else
276                     assert(0, "unexpected join type");
277             }
278 
279             stderr.write(`]`);
280         }
281 
282         stderr.writeln(`]}`);
283     }
284 
285     void writeNewScaffold(ContigNode startNode)
286     {
287         mixin(traceExecution);
288 
289         auto globalComplement = false;
290         auto insertionBegin = startNode;
291 
292         logJsonDebug(
293             "info", "writing scaffold",
294             "scaffoldId", startNode.contigId,
295         );
296 
297         writeHeader(startNode);
298         foreach (currentInsertion; linearWalk!InsertionInfo(assemblyGraph, startNode, incidentEdgesCache))
299         {
300             writeInsertion(
301                 insertionBegin,
302                 currentInsertion,
303                 globalComplement,
304             );
305 
306             insertionBegin = currentInsertion.target(insertionBegin);
307             if (currentInsertion.isAntiParallel)
308                 globalComplement = !globalComplement;
309         }
310         "\n".copy(writer);
311     }
312 
313     Insertion mergeInsertions(Insertion[] insertionsChunk)
314     {
315         assert(insertionsChunk.length > 0);
316 
317         if (insertionsChunk[0].isDefault)
318         {
319             auto merged = insertionsChunk[0];
320 
321             merged.payload.contigLength = insertionsChunk
322                 .map!"a.payload.contigLength"
323                 .maxElement;
324             assert(insertionsChunk
325                 .all!(a => a.payload.contigLength == 0 ||
326                            a.payload.contigLength == merged.payload.contigLength));
327             merged.payload.spliceSites = insertionsChunk
328                 .map!"a.payload.spliceSites"
329                 .joiner
330                 .array;
331 
332             return merged;
333         }
334         else
335         {
336             assert(insertionsChunk.length == 1);
337 
338             return insertionsChunk[0];
339         }
340     }
341 
342     protected void writeHeader(in ContigNode begin)
343     {
344         format!">scaffold-%d\n"(begin.contigId).copy(writer);
345     }
346 
347     protected void writeInsertion(
348         in ContigNode begin,
349         in Insertion insertion,
350         in bool globalComplement,
351     )
352     {
353         assert(insertion.isValidInsertion, "invalid insertion");
354 
355         if (insertion.isDefault)
356             writeExistingContig(begin, insertion, globalComplement);
357         else if (insertion.isOutputGap)
358             writeGap(insertion);
359         else
360             writeNewSequenceInsertion(begin, insertion, globalComplement);
361     }
362 
363     protected void writeExistingContig(
364         in ContigNode begin,
365         in Insertion insertion,
366         in bool globalComplement,
367     )
368     {
369         auto insertionInfo = getInfoForExistingContig(begin, insertion, globalComplement);
370 
371         auto contigSequence = getFastaSequence(options.refDb, insertionInfo.contigId, options.workdir);
372         contigSequence = contigSequence[insertionInfo.spliceStart .. insertionInfo.spliceEnd];
373 
374         logJsonDebug(
375             "info", "writing contig insertion",
376             "contigId", insertionInfo.contigId,
377             "contigLength", insertion.payload.contigLength,
378             "spliceStart", insertionInfo.spliceStart,
379             "spliceEnd", insertionInfo.spliceEnd,
380             "spliceSites", insertion.payload.spliceSites.toJson,
381             "start", insertion.start.toJson,
382             "end", insertion.end.toJson,
383         );
384 
385         if (insertionInfo.complement)
386             contigSequence.reverseComplementer.copy(writer);
387         else
388             contigSequence.copy(writer);
389     }
390 
391     protected void writeGap(
392         in Insertion insertion,
393     )
394     {
395         auto insertionInfo = getInfoForGap(insertion);
396 
397         logJsonDebug(
398             "info", "writing gap",
399             "gapLength", insertionInfo.length,
400             "start", insertion.start.toJson,
401             "end", insertion.end.toJson,
402         );
403 
404         enum char unkownBase = 'n';
405         unkownBase
406             .repeat
407             .takeExactly(insertionInfo.length)
408             .copy(writer);
409     }
410 
411     protected void writeNewSequenceInsertion(
412         in ContigNode begin,
413         in Insertion insertion,
414         in bool globalComplement,
415     )
416     {
417         auto insertionInfo = getInfoForNewSequenceInsertion(begin, insertion, globalComplement);
418 
419         logJsonDebug(
420             "info", "writing new sequence insertion",
421             "type", insertion.isGap ? "gap" : "extension",
422             "insertionLength", insertion.payload.sequence.length,
423             "isAntiParallel", insertion.isAntiParallel,
424             "localComplement", insertion.payload.spliceSites[0].flags.complement,
425             "start", insertion.start.toJson,
426             "end", insertion.end.toJson,
427         );
428 
429         if (insertionInfo.complement)
430             insertionInfo.sequence.bases!(char, Yes.reverse).map!(complement!char).copy(writer);
431         else
432             insertionInfo.sequence.bases!char.copy(writer);
433 
434     }
435 }
436 
437 
438 /// Remove contig cropping where no new sequence is to be inserted and adjust
439 /// cropping where cropped regions overlap.
440 OutputScaffold fixCropping(
441     OutputScaffold scaffold,
442     trace_point_t tracePointDistance,
443 )
444 {
445     mixin(traceExecution);
446 
447     alias replace = OutputScaffold.ConflictStrategy.replace;
448     auto contigJoins = scaffold.edges.filter!isDefault;
449     auto incidentEdgesCache = scaffold.allIncidentEdges();
450 
451     foreach (contigJoin; contigJoins)
452     {
453         auto insertionUpdated1 = transferCroppingFromIncidentJoins(contigJoin, incidentEdgesCache);
454         auto insertionUpdated2 = resolveCroppingConflicts(scaffold, contigJoin, incidentEdgesCache,
455                                                          tracePointDistance);
456 
457         version (assert)
458         {
459             auto spliceSites = contigJoin.payload.spliceSites;
460 
461             assert(spliceSites.length <= 2);
462             if (spliceSites.length == 2)
463                 assert(
464                     spliceSites[0].croppingRefPosition.value == spliceSites[1].croppingRefPosition.value ||
465                     (spliceSites[0].croppingRefPosition.value < spliceSites[1].croppingRefPosition.value)
466                     ==
467                     (spliceSites[0].alignmentSeed < spliceSites[1].alignmentSeed)
468                 );
469         }
470 
471         if (insertionUpdated1 || insertionUpdated2)
472             scaffold.add!replace(contigJoin);
473     }
474 
475 
476     return scaffold;
477 }
478 
479 Flag!"insertionUpdated" transferCroppingFromIncidentJoins(
480     ref Insertion contigJoin,
481     OutputScaffold.IncidentEdgesCache incidentEdgesCache,
482 )
483 {
484     SpliceSite spliceSiteFromIncidentJoins(ContigNode contigNode)
485     {
486         alias isSomeInsertion = insertion =>
487             !insertion.isOutputGap && (insertion.isGap || insertion.isExtension);
488 
489         auto incidentJoins = incidentEdgesCache[contigNode].filter!isSomeInsertion;
490 
491         if (incidentJoins.empty)
492             return SpliceSite();
493 
494         assert(incidentJoins.walkLength(2) == 1, "non-linearized scaffold");
495         auto spliceSitesForContigNode = incidentJoins
496             .front
497             .payload
498             .spliceSites
499             .filter!(spliceSite => spliceSite.croppingRefPosition.contigId == contigNode.contigId);
500         assert(!spliceSitesForContigNode.empty, "missing splice site");
501         assert(spliceSitesForContigNode.walkLength(2) <= 1, "too many splice sites");
502 
503         return spliceSitesForContigNode.front;
504     }
505 
506     contigJoin.payload.spliceSites = only(contigJoin.start, contigJoin.end)
507         .map!spliceSiteFromIncidentJoins
508         .filter!(spliceSite => spliceSite != SpliceSite.init)
509         .array;
510 
511     return cast(typeof(return)) (contigJoin.payload.spliceSites.length > 0);
512 }
513 
514 Flag!"insertionUpdated" resolveCroppingConflicts(
515     ref OutputScaffold scaffold,
516     ref Insertion contigJoin,
517     OutputScaffold.IncidentEdgesCache incidentEdgesCache,
518     trace_point_t tracePointDistance,
519 )
520 {
521     auto spliceSites = contigJoin.payload.spliceSites;
522 
523     if (spliceSites.length <= 1)
524         return No.insertionUpdated;
525 
526     alias croppingRefPos = (spliceSite) => spliceSite.croppingRefPosition.value;
527     if (croppingRefPos(spliceSites[0]) <= croppingRefPos(spliceSites[1]))
528         return No.insertionUpdated;
529 
530     resolveOverlappingCropping(scaffold, contigJoin, incidentEdgesCache, tracePointDistance);
531 
532     return Yes.insertionUpdated;
533 }
534 
535 private void resolveOverlappingCropping(
536     ref OutputScaffold scaffold,
537     ref Insertion contigJoin,
538     OutputScaffold.IncidentEdgesCache incidentEdgesCache,
539     trace_point_t tracePointDistance,
540 )
541 {
542     assert(contigJoin.payload.spliceSites.length == 2);
543 
544     auto spliceSites = contigJoin.payload.spliceSites;
545     // Crop overlapping insertions to the center of the overlap
546     alias croppingRefPos = (spliceSite) => spliceSite.croppingRefPosition.value;
547     auto newCroppingPos = floor(mean(spliceSites.map!croppingRefPos), tracePointDistance);
548 
549     void resolveOverlappingCroppingFor(ref SpliceSite spliceSite)
550     {
551         auto alignmentSeed = spliceSite.alignmentSeed;
552         auto croppingDiff = absdiff(
553             newCroppingPos,
554             spliceSite.croppingRefPosition.value,
555         );
556 
557         if (croppingDiff > 0)
558         {
559             alias replace = OutputScaffold.ConflictStrategy.replace;
560 
561             auto contigNode = alignmentSeed == AlignmentLocationSeed.front
562                 ? contigJoin.start
563                 : contigJoin.end;
564             auto incidentInsertion = incidentEdgesCache[contigNode]
565                 .find!(insertion => !insertion.isOutputGap && (insertion.isGap || insertion.isExtension))
566                 .front;
567             auto insertionSpliceSiteIdx = incidentInsertion
568                 .payload
569                 .spliceSites
570                 .countUntil!(spliceSite => spliceSite.croppingRefPosition.contigId == contigNode.contigId);
571             auto insertionSpliceSite = &incidentInsertion
572                 .payload
573                 .spliceSites[insertionSpliceSiteIdx];
574             auto shouldCropBack = (alignmentSeed == AlignmentLocationSeed.front) ^
575                                   (insertionSpliceSite.flags.complement);
576 
577             // Crop insertion to new cropping position
578             incidentInsertion.payload.sequence = shouldCropBack
579                 ? incidentInsertion.payload.sequence[0 .. $ - croppingDiff]
580                 : incidentInsertion.payload.sequence[croppingDiff .. $];
581             insertionSpliceSite.croppingRefPosition.value = newCroppingPos;
582 
583             // Store updated insertion in scaffold and update the cache
584             scaffold.add!replace(incidentInsertion);
585             incidentEdgesCache[incidentInsertion.start]
586                 .replaceInPlace(incidentInsertion, incidentInsertion);
587             incidentEdgesCache[incidentInsertion.end]
588                 .replaceInPlace(incidentInsertion, incidentInsertion);
589 
590             // Update splice site of contigJoin
591             spliceSite.croppingRefPosition.value = newCroppingPos;
592         }
593     }
594 
595     foreach (ref spliceSite; spliceSites)
596         resolveOverlappingCroppingFor(spliceSite);
597 }