1 /** 2 This is the `output` command of `dentist`. 3 4 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dentist.commands.output; 10 11 import dentist.commandline : DentistCommand, OptionsFor; 12 import dentist.common : ReferenceInterval, ReferencePoint; 13 import dentist.common.alignments : 14 AlignmentChain, 15 AlignmentLocationSeed, 16 id_t, 17 trace_point_t; 18 import dentist.common.binio : 19 CompressedSequence, 20 InsertionDb; 21 import dentist.common.insertions : 22 getInfoForExistingContig, 23 getInfoForGap, 24 getInfoForNewSequenceInsertion, 25 Insertion, 26 InsertionInfo, 27 isOutputGap, 28 isValidInsertion, 29 OutputScaffold, 30 SpliceSite; 31 import dentist.common.scaffold : 32 ContigNode, 33 ContigPart, 34 scaffoldStarts, 35 enforceJoinPolicy, 36 getUnkownJoin, 37 initScaffold, 38 isAntiParallel, 39 isDefault, 40 isExtension, 41 isFrontExtension, 42 isGap, 43 linearWalk, 44 normalizeUnkownJoins, 45 removeExtensions; 46 import dentist.dazzler : 47 ContigSegment, 48 GapSegment, 49 getNumContigs, 50 getFastaSequence, 51 getScaffoldStructure, 52 ScaffoldSegment; 53 import dentist.util.algorithm : replaceInPlace; 54 import dentist.util.fasta : complement, reverseComplementer; 55 import dentist.util.log; 56 import dentist.util.math : 57 absdiff, 58 add, 59 bulkAdd, 60 ceil, 61 floor, 62 mean; 63 import dentist.util.range : wrapLines; 64 import std.algorithm : 65 all, 66 canFind, 67 copy, 68 count, 69 countUntil, 70 filter, 71 find, 72 fold, 73 joiner, 74 map, 75 maxElement, 76 swap, 77 swapAt; 78 import std.array : array; 79 import std.conv : to; 80 import std.format : format; 81 import std.range : 82 enumerate, 83 only, 84 repeat, 85 takeExactly, 86 walkLength; 87 import std.range.primitives : empty, front, popFront, save; 88 import std.stdio : File, stderr, stdout; 89 import std.typecons : Flag, No, tuple, Tuple, Yes; 90 import vibe.data.json : toJson = serializeToJson; 91 92 93 /// Options for the `output` command. 94 alias Options = OptionsFor!(DentistCommand.output); 95 96 97 /// Execute the `output` command with `options`. 98 void execute(Options)(in Options options) 99 { 100 auto writer = new AssemblyWriter(options); 101 102 writer.run(); 103 } 104 105 class AssemblyWriter 106 { 107 alias FastaWriter = typeof(wrapLines(stdout.lockingTextWriter, 0)); 108 109 protected const Options options; 110 protected const(ScaffoldSegment)[] scaffoldStructure; 111 size_t numReferenceContigs; 112 size_t[] contigLengths; 113 OutputScaffold assemblyGraph; 114 OutputScaffold.IncidentEdgesCache incidentEdgesCache; 115 ContigNode[] scaffoldStartNodes; 116 File assemblyFile; 117 FastaWriter writer; 118 119 this(in ref Options options) 120 { 121 this.options = options; 122 this.assemblyFile = options.assemblyFile is null 123 ? stdout 124 : File(options.assemblyFile, "w"); 125 this.writer = wrapLines(assemblyFile.lockingTextWriter, options.fastaLineWidth); 126 } 127 128 void run() 129 { 130 mixin(traceExecution); 131 132 init(); 133 buildAssemblyGraph(); 134 scaffoldStartNodes = scaffoldStarts!InsertionInfo(assemblyGraph, incidentEdgesCache).array; 135 logStatistics(); 136 137 foreach (startNode; scaffoldStartNodes) 138 writeNewScaffold(startNode); 139 } 140 141 protected void init() 142 { 143 mixin(traceExecution); 144 145 numReferenceContigs = getNumContigs(options.refDb, options.workdir); 146 scaffoldStructure = getScaffoldStructure(options.refDb).array; 147 contigLengths = scaffoldStructure 148 .filter!(part => part.peek!ContigSegment !is null) 149 .map!(contigPart => contigPart.get!ContigSegment) 150 .map!(contigPart => contigPart.end - contigPart.begin + 0) 151 .array; 152 } 153 154 155 protected void buildAssemblyGraph() 156 { 157 mixin(traceExecution); 158 159 auto insertionDb = InsertionDb.parse(options.insertionsFile); 160 auto insertions = insertionDb[]; 161 162 assemblyGraph = initScaffold!( 163 contigId => InsertionInfo(CompressedSequence(), contigLengths[contigId - 1], []), 164 InsertionInfo, 165 )(numReferenceContigs); 166 assemblyGraph.bulkAdd!(joins => mergeInsertions(joins))(insertions); 167 appendUnkownJoins(); 168 169 if (!options.shouldExtendContigs) 170 { 171 assemblyGraph = assemblyGraph.removeExtensions!InsertionInfo(); 172 } 173 174 assemblyGraph = assemblyGraph 175 .enforceJoinPolicy!InsertionInfo(options.joinPolicy) 176 .normalizeUnkownJoins!InsertionInfo() 177 .fixCropping(options.tracePointDistance); 178 incidentEdgesCache = assemblyGraph.allIncidentEdges(); 179 180 if (options.assemblyGraphFile !is null) 181 InsertionDb.write(options.assemblyGraphFile, assemblyGraph.edges); 182 } 183 184 185 protected void appendUnkownJoins() 186 { 187 auto unkownJoins = scaffoldStructure[] 188 .filter!(part => part.peek!GapSegment !is null) 189 .map!(gapPart => gapPart.get!GapSegment) 190 .map!(gapPart => getUnkownJoin( 191 gapPart.beginGlobalContigId, 192 gapPart.endGlobalContigId, 193 InsertionInfo(CompressedSequence(), gapPart.length, []), 194 )); 195 assemblyGraph.bulkAddForce(unkownJoins); 196 } 197 198 void logStatistics() 199 { 200 mixin(traceExecution); 201 202 // Due to several transformations of the assembly graph the predicates 203 // for "join types" must be combined to yield the expected result. 204 alias _isSpannedGap = j => j.isGap && !(j.isOutputGap || j.isDefault); 205 alias _isExtension = j => !j.isGap && !(j.isOutputGap || j.isDefault); 206 alias _isRemaingGap = isOutputGap; 207 alias _isExistingContig = isDefault; 208 209 logJsonInfo( 210 "numGraphEdges", assemblyGraph.edges.length, 211 "numScaffolds", scaffoldStartNodes.length, 212 "numSpannedGaps", assemblyGraph.edges.count!_isSpannedGap, 213 "numExtensions", assemblyGraph.edges.count!_isExtension, 214 "numRemainingGaps", assemblyGraph.edges.count!_isRemaingGap, 215 "numExistingContigs", assemblyGraph.edges.count!_isExistingContig, 216 ); 217 218 if (shouldLog(LogLevel.diagnostic)) 219 logSparseInsertionWalks(); 220 221 debug logJsonDebug( 222 "insertionWalks", scaffoldStartNodes 223 .map!(startNode => linearWalk!InsertionInfo(assemblyGraph, startNode, incidentEdgesCache) 224 .map!(join => [ 225 "start": join.start.toJson, 226 "end": join.end.toJson, 227 "payload": [ 228 "sequence": join.payload.sequence.to!string.toJson, 229 "contigLength": join.payload.contigLength.toJson, 230 "spliceSites": join.payload.spliceSites.toJson, 231 ].toJson, 232 ]) 233 .array 234 ) 235 .array 236 .toJson 237 ); 238 } 239 240 void logSparseInsertionWalks() 241 { 242 stderr.write(`{"scaffolds":[`); 243 244 foreach (i, startNode; scaffoldStartNodes) 245 { 246 if (i == 0) 247 stderr.write(`[`); 248 else 249 stderr.write(`,[`); 250 251 foreach (enumJoin; enumerate(linearWalk!InsertionInfo(assemblyGraph, startNode, incidentEdgesCache))) 252 { 253 auto j = enumJoin.index; 254 auto join = enumJoin.value; 255 256 if (j > 0) 257 stderr.write(`,`); 258 259 if (join.isDefault) 260 stderr.writef!`{"action":"copy","contigId":%d}`(join.start.contigId); 261 else if (join.isOutputGap) 262 stderr.writef!`{"action":"gap","length":%d}`(join.payload.contigLength); 263 else if (join.isGap) 264 stderr.writef!`{"action":"insert","contigIds":[%d,%d],"length":%d}`( 265 join.start.contigId, 266 join.end.contigId, 267 join.payload.sequence.length 268 ); 269 else if (join.isExtension) 270 stderr.writef!`{"action":"insert","contigIds":[%d],"contigPart":"%s","length":%d}`( 271 join.start.contigId, 272 join.isFrontExtension ? "front" : "back", 273 join.payload.sequence.length 274 ); 275 else 276 assert(0, "unexpected join type"); 277 } 278 279 stderr.write(`]`); 280 } 281 282 stderr.writeln(`]}`); 283 } 284 285 void writeNewScaffold(ContigNode startNode) 286 { 287 mixin(traceExecution); 288 289 auto globalComplement = false; 290 auto insertionBegin = startNode; 291 292 logJsonDebug( 293 "info", "writing scaffold", 294 "scaffoldId", startNode.contigId, 295 ); 296 297 writeHeader(startNode); 298 foreach (currentInsertion; linearWalk!InsertionInfo(assemblyGraph, startNode, incidentEdgesCache)) 299 { 300 writeInsertion( 301 insertionBegin, 302 currentInsertion, 303 globalComplement, 304 ); 305 306 insertionBegin = currentInsertion.target(insertionBegin); 307 if (currentInsertion.isAntiParallel) 308 globalComplement = !globalComplement; 309 } 310 "\n".copy(writer); 311 } 312 313 Insertion mergeInsertions(Insertion[] insertionsChunk) 314 { 315 assert(insertionsChunk.length > 0); 316 317 if (insertionsChunk[0].isDefault) 318 { 319 auto merged = insertionsChunk[0]; 320 321 merged.payload.contigLength = insertionsChunk 322 .map!"a.payload.contigLength" 323 .maxElement; 324 assert(insertionsChunk 325 .all!(a => a.payload.contigLength == 0 || 326 a.payload.contigLength == merged.payload.contigLength)); 327 merged.payload.spliceSites = insertionsChunk 328 .map!"a.payload.spliceSites" 329 .joiner 330 .array; 331 332 return merged; 333 } 334 else 335 { 336 assert(insertionsChunk.length == 1); 337 338 return insertionsChunk[0]; 339 } 340 } 341 342 protected void writeHeader(in ContigNode begin) 343 { 344 format!">scaffold-%d\n"(begin.contigId).copy(writer); 345 } 346 347 protected void writeInsertion( 348 in ContigNode begin, 349 in Insertion insertion, 350 in bool globalComplement, 351 ) 352 { 353 assert(insertion.isValidInsertion, "invalid insertion"); 354 355 if (insertion.isDefault) 356 writeExistingContig(begin, insertion, globalComplement); 357 else if (insertion.isOutputGap) 358 writeGap(insertion); 359 else 360 writeNewSequenceInsertion(begin, insertion, globalComplement); 361 } 362 363 protected void writeExistingContig( 364 in ContigNode begin, 365 in Insertion insertion, 366 in bool globalComplement, 367 ) 368 { 369 auto insertionInfo = getInfoForExistingContig(begin, insertion, globalComplement); 370 371 auto contigSequence = getFastaSequence(options.refDb, insertionInfo.contigId, options.workdir); 372 contigSequence = contigSequence[insertionInfo.spliceStart .. insertionInfo.spliceEnd]; 373 374 logJsonDebug( 375 "info", "writing contig insertion", 376 "contigId", insertionInfo.contigId, 377 "contigLength", insertion.payload.contigLength, 378 "spliceStart", insertionInfo.spliceStart, 379 "spliceEnd", insertionInfo.spliceEnd, 380 "spliceSites", insertion.payload.spliceSites.toJson, 381 "start", insertion.start.toJson, 382 "end", insertion.end.toJson, 383 ); 384 385 if (insertionInfo.complement) 386 contigSequence.reverseComplementer.copy(writer); 387 else 388 contigSequence.copy(writer); 389 } 390 391 protected void writeGap( 392 in Insertion insertion, 393 ) 394 { 395 auto insertionInfo = getInfoForGap(insertion); 396 397 logJsonDebug( 398 "info", "writing gap", 399 "gapLength", insertionInfo.length, 400 "start", insertion.start.toJson, 401 "end", insertion.end.toJson, 402 ); 403 404 enum char unkownBase = 'n'; 405 unkownBase 406 .repeat 407 .takeExactly(insertionInfo.length) 408 .copy(writer); 409 } 410 411 protected void writeNewSequenceInsertion( 412 in ContigNode begin, 413 in Insertion insertion, 414 in bool globalComplement, 415 ) 416 { 417 auto insertionInfo = getInfoForNewSequenceInsertion(begin, insertion, globalComplement); 418 419 logJsonDebug( 420 "info", "writing new sequence insertion", 421 "type", insertion.isGap ? "gap" : "extension", 422 "insertionLength", insertion.payload.sequence.length, 423 "isAntiParallel", insertion.isAntiParallel, 424 "localComplement", insertion.payload.spliceSites[0].flags.complement, 425 "start", insertion.start.toJson, 426 "end", insertion.end.toJson, 427 ); 428 429 if (insertionInfo.complement) 430 insertionInfo.sequence.bases!(char, Yes.reverse).map!(complement!char).copy(writer); 431 else 432 insertionInfo.sequence.bases!char.copy(writer); 433 434 } 435 } 436 437 438 /// Remove contig cropping where no new sequence is to be inserted and adjust 439 /// cropping where cropped regions overlap. 440 OutputScaffold fixCropping( 441 OutputScaffold scaffold, 442 trace_point_t tracePointDistance, 443 ) 444 { 445 mixin(traceExecution); 446 447 alias replace = OutputScaffold.ConflictStrategy.replace; 448 auto contigJoins = scaffold.edges.filter!isDefault; 449 auto incidentEdgesCache = scaffold.allIncidentEdges(); 450 451 foreach (contigJoin; contigJoins) 452 { 453 auto insertionUpdated1 = transferCroppingFromIncidentJoins(contigJoin, incidentEdgesCache); 454 auto insertionUpdated2 = resolveCroppingConflicts(scaffold, contigJoin, incidentEdgesCache, 455 tracePointDistance); 456 457 version (assert) 458 { 459 auto spliceSites = contigJoin.payload.spliceSites; 460 461 assert(spliceSites.length <= 2); 462 if (spliceSites.length == 2) 463 assert( 464 spliceSites[0].croppingRefPosition.value == spliceSites[1].croppingRefPosition.value || 465 (spliceSites[0].croppingRefPosition.value < spliceSites[1].croppingRefPosition.value) 466 == 467 (spliceSites[0].alignmentSeed < spliceSites[1].alignmentSeed) 468 ); 469 } 470 471 if (insertionUpdated1 || insertionUpdated2) 472 scaffold.add!replace(contigJoin); 473 } 474 475 476 return scaffold; 477 } 478 479 Flag!"insertionUpdated" transferCroppingFromIncidentJoins( 480 ref Insertion contigJoin, 481 OutputScaffold.IncidentEdgesCache incidentEdgesCache, 482 ) 483 { 484 SpliceSite spliceSiteFromIncidentJoins(ContigNode contigNode) 485 { 486 alias isSomeInsertion = insertion => 487 !insertion.isOutputGap && (insertion.isGap || insertion.isExtension); 488 489 auto incidentJoins = incidentEdgesCache[contigNode].filter!isSomeInsertion; 490 491 if (incidentJoins.empty) 492 return SpliceSite(); 493 494 assert(incidentJoins.walkLength(2) == 1, "non-linearized scaffold"); 495 auto spliceSitesForContigNode = incidentJoins 496 .front 497 .payload 498 .spliceSites 499 .filter!(spliceSite => spliceSite.croppingRefPosition.contigId == contigNode.contigId); 500 assert(!spliceSitesForContigNode.empty, "missing splice site"); 501 assert(spliceSitesForContigNode.walkLength(2) <= 1, "too many splice sites"); 502 503 return spliceSitesForContigNode.front; 504 } 505 506 contigJoin.payload.spliceSites = only(contigJoin.start, contigJoin.end) 507 .map!spliceSiteFromIncidentJoins 508 .filter!(spliceSite => spliceSite != SpliceSite.init) 509 .array; 510 511 return cast(typeof(return)) (contigJoin.payload.spliceSites.length > 0); 512 } 513 514 Flag!"insertionUpdated" resolveCroppingConflicts( 515 ref OutputScaffold scaffold, 516 ref Insertion contigJoin, 517 OutputScaffold.IncidentEdgesCache incidentEdgesCache, 518 trace_point_t tracePointDistance, 519 ) 520 { 521 auto spliceSites = contigJoin.payload.spliceSites; 522 523 if (spliceSites.length <= 1) 524 return No.insertionUpdated; 525 526 alias croppingRefPos = (spliceSite) => spliceSite.croppingRefPosition.value; 527 if (croppingRefPos(spliceSites[0]) <= croppingRefPos(spliceSites[1])) 528 return No.insertionUpdated; 529 530 resolveOverlappingCropping(scaffold, contigJoin, incidentEdgesCache, tracePointDistance); 531 532 return Yes.insertionUpdated; 533 } 534 535 private void resolveOverlappingCropping( 536 ref OutputScaffold scaffold, 537 ref Insertion contigJoin, 538 OutputScaffold.IncidentEdgesCache incidentEdgesCache, 539 trace_point_t tracePointDistance, 540 ) 541 { 542 assert(contigJoin.payload.spliceSites.length == 2); 543 544 auto spliceSites = contigJoin.payload.spliceSites; 545 // Crop overlapping insertions to the center of the overlap 546 alias croppingRefPos = (spliceSite) => spliceSite.croppingRefPosition.value; 547 auto newCroppingPos = floor(mean(spliceSites.map!croppingRefPos), tracePointDistance); 548 549 void resolveOverlappingCroppingFor(ref SpliceSite spliceSite) 550 { 551 auto alignmentSeed = spliceSite.alignmentSeed; 552 auto croppingDiff = absdiff( 553 newCroppingPos, 554 spliceSite.croppingRefPosition.value, 555 ); 556 557 if (croppingDiff > 0) 558 { 559 alias replace = OutputScaffold.ConflictStrategy.replace; 560 561 auto contigNode = alignmentSeed == AlignmentLocationSeed.front 562 ? contigJoin.start 563 : contigJoin.end; 564 auto incidentInsertion = incidentEdgesCache[contigNode] 565 .find!(insertion => !insertion.isOutputGap && (insertion.isGap || insertion.isExtension)) 566 .front; 567 auto insertionSpliceSiteIdx = incidentInsertion 568 .payload 569 .spliceSites 570 .countUntil!(spliceSite => spliceSite.croppingRefPosition.contigId == contigNode.contigId); 571 auto insertionSpliceSite = &incidentInsertion 572 .payload 573 .spliceSites[insertionSpliceSiteIdx]; 574 auto shouldCropBack = (alignmentSeed == AlignmentLocationSeed.front) ^ 575 (insertionSpliceSite.flags.complement); 576 577 // Crop insertion to new cropping position 578 incidentInsertion.payload.sequence = shouldCropBack 579 ? incidentInsertion.payload.sequence[0 .. $ - croppingDiff] 580 : incidentInsertion.payload.sequence[croppingDiff .. $]; 581 insertionSpliceSite.croppingRefPosition.value = newCroppingPos; 582 583 // Store updated insertion in scaffold and update the cache 584 scaffold.add!replace(incidentInsertion); 585 incidentEdgesCache[incidentInsertion.start] 586 .replaceInPlace(incidentInsertion, incidentInsertion); 587 incidentEdgesCache[incidentInsertion.end] 588 .replaceInPlace(incidentInsertion, incidentInsertion); 589 590 // Update splice site of contigJoin 591 spliceSite.croppingRefPosition.value = newCroppingPos; 592 } 593 } 594 595 foreach (ref spliceSite; spliceSites) 596 resolveOverlappingCroppingFor(spliceSite); 597 }