1 /** 2 This is the `checkResults` command of `dentist`. 3 4 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dentist.commands.checkResults; 10 11 import dentist.common : isTesting; 12 13 static if (isTesting): 14 15 import core.atomic : atomicOp; 16 import dentist.commandline : TestingCommand, OptionsFor; 17 import dentist.common : 18 ReferenceInterval, 19 ReferenceRegion, 20 to; 21 import dentist.common.alignments : 22 AlignmentChain, 23 coord_t, 24 id_t; 25 import dentist.dazzler : 26 ContigSegment, 27 fingerprint, 28 GapSegment, 29 getAlignments, 30 getExactAlignment, 31 getScaffoldStructure, 32 readMask, 33 ScaffoldSegment; 34 import dentist.util.algorithm : 35 cmpLexicographically, 36 first, 37 last, 38 sliceBy; 39 import dentist.util.log; 40 import dentist.util.math : 41 absdiff, 42 ceildiv, 43 longestIncreasingSubsequence, 44 mean, 45 median, 46 N, 47 NaturalNumberSet; 48 import dentist.util.range : tupleMap; 49 import std.algorithm : 50 all, 51 among, 52 chunkBy, 53 copy, 54 count, 55 filter, 56 find, 57 fold, 58 joiner, 59 map, 60 max, 61 maxElement, 62 min, 63 remove, 64 sort, 65 sum, 66 uniq; 67 import std.array : appender, array; 68 import std.format : format; 69 import std.math : 70 ceil, 71 log10; 72 import std.parallelism : parallel; 73 import std.range : 74 assumeSorted, 75 chain, 76 enumerate, 77 only, 78 StoppingPolicy, 79 zip; 80 import std.range.primitives; 81 import std.regex : ctRegex, replaceAll; 82 import std.stdio : File, writeln; 83 import std..string : join; 84 import std.typecons : Tuple; 85 import vibe.data.json : Json, toJson = serializeToJson, toJsonString = serializeToPrettyJson; 86 87 88 /// Options for the `collectPileUps` command. 89 alias Options = OptionsFor!(TestingCommand.checkResults); 90 91 /// Execute the `checkResults` command with `options`. 92 void execute(in Options options) 93 { 94 auto analyzer = ResultAnalyzer(options); 95 auto stats = analyzer.collect(); 96 97 if (options.useJson) 98 writeln(stats.toJsonString()); 99 else 100 writeln(stats.toTabular()); 101 } 102 103 private struct ResultAnalyzer 104 { 105 alias ExactAlignment = typeof(getExactAlignment("", "", resultAlignment[0], "")); 106 alias Fingerprint = typeof(fingerprint(&resultAlignment[0])); 107 static enum identityLevels = Stats.identityLevels; 108 109 const(Options) options; 110 protected const(ScaffoldSegment)[] trueAssemblyScaffoldStructure; 111 protected const(ScaffoldSegment)[] resultScaffoldStructure; 112 protected coord_t referenceOffset; 113 protected AlignmentChain[] resultAlignment; 114 protected ExactAlignment[Fingerprint] exactResultAlignments; 115 protected ReferenceRegion mappedRegionsMask; 116 protected ReferenceRegion referenceGaps; 117 protected ReferenceRegion reconstructedRegions; 118 protected ReferenceRegion[] reconstructedGaps; 119 protected size_t[][identityLevels.length] correctContigsPerIdentityLevel; 120 protected size_t[][identityLevels.length] correctGapsPerIdentityLevel; 121 122 Stats collect() 123 { 124 mixin(traceExecution); 125 126 init(); 127 128 if (options.alignmentTabular !is null) 129 writeAlignmentTabular( 130 File(options.alignmentTabular, "w"), 131 resultAlignment, 132 ); 133 134 Stats stats; 135 136 stats.numBpsExpected = getNumBpsExpected(); 137 stats.numBpsKnown = getNumBpsKnown(); 138 stats.numBpsResult = getNumBpsResult(); 139 stats.numTranslocatedGaps = getNumTranslocatedGaps(); 140 stats.numBpsCorrect = getNumBpsCorrect(); 141 stats.numContigsExpected = getNumContigsExpected(); 142 stats.numCorrectContigs = getNumCorrectContigs(); 143 stats.numCorrectGaps = getNumCorrectGaps(); 144 stats.numClosedGaps = getNumClosedGaps(); 145 stats.numPartlyClosedGaps = getNumPartlyClosedGaps(); 146 stats.numUnaddressedGaps = getNumUnaddressedGaps(); 147 stats.numBpsInClosedGaps = getNumBpsInClosedGaps(); 148 stats.numBpsInGaps = getNumBpsInGaps(); 149 stats.maximumN50 = getMaximumN50(); 150 stats.inputN50 = getInputN50(); 151 stats.resultN50 = getResultN50(); 152 stats.gapMedian = getGapMedian(); 153 stats.closedGapMedian = getClosedGapMedian(); 154 stats.maxClosedGap = getMaxClosedGap(); 155 if (options.bucketSize > 0) 156 { 157 stats.correctGapLengthHistograms = getCorrectGapLengthHistograms(); 158 stats.closedGapLengthHistogram = getClosedGapLengthHistogram(); 159 stats.gapLengthHistogram = getGapLengthHistogram(); 160 } 161 162 return stats; 163 } 164 165 void init() 166 { 167 trueAssemblyScaffoldStructure = getScaffoldStructure(options.trueAssemblyDb).array; 168 resultScaffoldStructure = getScaffoldStructure(options.resultDb).array; 169 if (options.onlyContigId == 0) 170 resultAlignment = cleanUpAlignments(getAlignments( 171 options.trueAssemblyDb, 172 options.resultDb, 173 options.resultsAlignmentFile, 174 options.workdir, 175 options.tracePointDistance, 176 )); 177 else 178 resultAlignment = cleanUpAlignments(getAlignments( 179 options.trueAssemblyDb, 180 options.resultDb, 181 options.resultsAlignmentFile, 182 options.workdir, 183 options.tracePointDistance, 184 ).filter!(ac => ac.contigA.id == options.onlyContigId).array); 185 mappedRegionsMask = ReferenceRegion(readMask!ReferenceInterval( 186 options.trueAssemblyDb, 187 options.mappedRegionsMask, 188 options.workdir, 189 )); 190 referenceOffset = cast(coord_t) mappedRegionsMask.intervals[0].begin; 191 debug logJsonDebug("referenceOffset", referenceOffset); 192 referenceGaps = getReferenceGaps(); 193 reconstructedRegions = getReconstructedRegions(); 194 reconstructedGaps = getReconstructedGaps(); 195 debug logJsonDebug( 196 "referenceGaps", referenceGaps.toJson, 197 "reconstructedRegions", reconstructedRegions.toJson, 198 "reconstructedGaps", reconstructedGaps.toJson, 199 ); 200 calculateExactResultAlignments(); 201 correctContigsPerIdentityLevel = getCorrectRegions(mappedRegionsMask.intervals); 202 if (options.gapDetailsTabular !is null) 203 correctGapsPerIdentityLevel = getCorrectRegions( 204 referenceGaps 205 .intervals 206 .array, 207 File(options.gapDetailsTabular, "w"), 208 ); 209 else 210 correctGapsPerIdentityLevel = getCorrectRegions( 211 referenceGaps 212 .intervals 213 .array, 214 ); 215 } 216 217 AlignmentChain[] cleanUpAlignments(AlignmentChain[] localAlignments) 218 { 219 mixin(traceExecution); 220 221 static struct AlignmentEvent 222 { 223 size_t contigId; 224 size_t position; 225 int diff; 226 AlignmentChain* alignmentChain; 227 228 int opCmp(in AlignmentEvent other) const pure nothrow 229 { 230 return cmpLexicographically!( 231 const(AlignmentEvent), 232 e => e.contigId, 233 e => e.position, 234 e => -e.diff, 235 )(this, other); 236 } 237 } 238 239 // 1. Prepare alignmentEvents 240 auto alignmentEvents = localAlignments 241 .map!((ref alignment) => only( 242 AlignmentEvent(alignment.contigA.id, alignment.first.contigA.begin, 1, &alignment), 243 AlignmentEvent(alignment.contigA.id, alignment.last.contigA.end, -1, &alignment), 244 )) 245 .joiner; 246 auto contigBoundaryEvents = localAlignments 247 .map!"a.contigA" 248 .uniq 249 .map!(contig => only( 250 AlignmentEvent(contig.id, 0, 0, null), 251 AlignmentEvent(contig.id, contig.length, 0, null), 252 )) 253 .joiner; 254 auto changeEvents = chain(alignmentEvents, contigBoundaryEvents).array; 255 changeEvents.sort(); 256 257 if (changeEvents.length == 0) 258 return localAlignments; 259 260 // 2. Find overlapping alignments 261 int currentCoverage; 262 auto overlappingAlignments = appender!(AlignmentChain*[]); 263 auto goodAlignments = appender!(AlignmentChain*[]); 264 foreach (changeEvent; changeEvents) 265 { 266 currentCoverage += changeEvent.diff; 267 268 if (currentCoverage == 0 && overlappingAlignments.data.length > 0) 269 { 270 // Collected local group of overlapping alignments 271 272 auto locallyCleanedUpAlignments = overlappingAlignments.data.length == 1 273 ? overlappingAlignments.data 274 : locallyCleanUpAlignments(overlappingAlignments.data); 275 logJsonDebug( 276 "overlappingAlignments", overlappingAlignments.data.map!"*a".array.toJson, 277 "locallyCleanedUpAlignments", locallyCleanedUpAlignments.map!"*a".array.toJson, 278 ); 279 goodAlignments ~= locallyCleanedUpAlignments; 280 281 overlappingAlignments.clear(); 282 } 283 else if (currentCoverage >= 1 && changeEvent.diff > 0) 284 { 285 // Found overlapping alignments 286 assert(changeEvent.alignmentChain !is null); 287 overlappingAlignments ~= changeEvent.alignmentChain; 288 } 289 } 290 291 logJsonDiagnostic( 292 "numRawLocalAlignments", localAlignments.length, 293 "numCleanedLocalAlignments", goodAlignments.data.length, 294 ); 295 296 return goodAlignments.data.map!"*a".array; 297 } 298 299 AlignmentChain*[] locallyCleanUpAlignments(AlignmentChain*[] overlappingAlignments) 300 { 301 ReferenceInterval interval(in AlignmentChain* ac) pure 302 { 303 return ReferenceInterval( 304 ac.contigA.id, 305 ac.first.contigA.begin, 306 ac.last.contigA.end, 307 ); 308 } 309 310 alias containedWithin = (lhs, rhs) => interval(lhs) in interval(rhs); 311 alias overlapEachOther = (lhs, rhs) => 312 interval(lhs).intersects(interval(rhs)); 313 314 // 1. Remove LAs completely included in others 315 foreach (ac; overlappingAlignments) 316 foreach (other; overlappingAlignments) 317 ac.disableIf(!other.flags.disabled && containedWithin(ac, other)); 318 319 // 2. Find combination of alignments, s.t. 320 // - they are not disabled 321 // - they do not overlap 322 // - if more than one: cover maximum possible area 323 // - if more than one: have best quality 324 325 static struct ViableCombination 326 { 327 ReferenceRegion region; 328 AlignmentChain*[] acs; 329 } 330 331 auto viableCombinationsAcc = appender!(ViableCombination[]); 332 333 void findNonOverlappingCombinations(AlignmentChain*[] candidates, ReferenceRegion region, AlignmentChain*[] included) 334 { 335 size_t numExpansions; 336 foreach (i, candidate; candidates) 337 { 338 auto candidateInterval = interval(candidate); 339 auto newRegion = region | candidateInterval; 340 341 if (newRegion.size == region.size + candidateInterval.size) 342 { 343 findNonOverlappingCombinations( 344 candidates.remove(i), 345 newRegion, 346 included ~ [candidate], 347 ); 348 ++numExpansions; 349 } 350 } 351 352 if (numExpansions == 0) 353 viableCombinationsAcc ~= ViableCombination( 354 region, 355 included, 356 ); 357 } 358 359 findNonOverlappingCombinations( 360 overlappingAlignments.filter!"!a.flags.disabled".array, 361 ReferenceRegion(), 362 [], 363 ); 364 365 assert(viableCombinationsAcc.data.length > 0); 366 367 return viableCombinationsAcc 368 .data 369 .maxElement!(vc => vc.region.size - vc.acs.map!"a.totalDiffs".sum.to!double) 370 .acs; 371 } 372 373 void calculateExactResultAlignments() 374 { 375 mixin(traceExecution); 376 377 foreach (alignmentChain; parallel(resultAlignment)) 378 { 379 auto acFingerprint = fingerprint(&alignmentChain); 380 auto exactAlignment = getExactAlignment( 381 options.trueAssemblyDb, 382 options.resultDb, 383 alignmentChain, 384 options.workdir, 385 10*2^^30, 386 ); 387 388 synchronized 389 exactResultAlignments[acFingerprint] = exactAlignment; 390 } 391 } 392 393 ReferenceRegion getReferenceGaps() 394 { 395 mixin(traceExecution); 396 397 alias scaffoldRegion = (contigPart) => ReferenceRegion(ReferenceInterval( 398 contigPart.globalContigId, 399 0, 400 contigPart.length, 401 )); 402 alias mappedGapsMask = (contigPart) => scaffoldRegion(contigPart) - mappedRegionsMask; 403 404 return ReferenceRegion(trueAssemblyScaffoldStructure 405 .filter!(contigPart => contigPart.peek!ContigSegment !is null) 406 .map!(contigPart => contigPart.get!ContigSegment) 407 .map!(contigPart => cast(ReferenceInterval[]) mappedGapsMask(contigPart).intervals) 408 .joiner 409 .filter!(gap => isInnerGap(gap)) 410 .array 411 ); 412 } 413 414 ReferenceRegion getReconstructedRegions() 415 { 416 mixin(traceExecution); 417 418 return ReferenceRegion(resultAlignment 419 .map!(ac => ac 420 .localAlignments 421 .map!(la => ReferenceInterval( 422 ac.contigA.id, 423 la.contigA.begin, 424 la.contigA.end, 425 ))) 426 .joiner 427 .array); 428 } 429 430 ReferenceRegion[] getReconstructedGaps() 431 { 432 mixin(traceExecution); 433 434 return referenceGaps 435 .intervals 436 .map!(gap => reconstructedRegions & gap) 437 .array; 438 } 439 440 void writeAlignmentTabular(File tabularFile, AlignmentChain[] alignmentChains) 441 { 442 coord_t contigBId; 443 coord_t contigBOffset; 444 foreach (alignmentChain; alignmentChains) 445 { 446 if (contigBId != alignmentChain.contigB.id) 447 { 448 contigBId = alignmentChain.contigB.id; 449 contigBOffset = getResultContigBegin(contigBId); 450 } 451 452 alias percentSimilarity = (la) => 1.0 - cast(double) la.numDiffs / 453 cast(double) (la.contigA.end - la.contigA.begin); 454 455 foreach (i, localAlignment; alignmentChain.localAlignments) 456 { 457 tabularFile.writefln!"%12d %12d %12d %12d %12f %d"( 458 alignmentChain.contigA.id, 459 alignmentChain.contigB.id, 460 localAlignment.contigA.begin, 461 alignmentChain.flags.complement 462 ? alignmentChain.contigB.length - (contigBOffset + localAlignment.contigB.end) 463 : contigBOffset + localAlignment.contigB.begin, 464 0.0, 465 alignmentChain.flags.complement ? -1 : 1, 466 ); 467 tabularFile.writefln!"%12d %12d %12d %12d %12f %d"( 468 alignmentChain.contigA.id, 469 alignmentChain.contigB.id, 470 localAlignment.contigA.end, 471 alignmentChain.flags.complement 472 ? alignmentChain.contigB.length - (contigBOffset + localAlignment.contigB.begin) 473 : contigBOffset + localAlignment.contigB.end, 474 percentSimilarity(localAlignment), 475 alignmentChain.flags.complement ? -1 : 1, 476 ); 477 } 478 tabularFile.writeln(); 479 } 480 } 481 482 coord_t getResultContigBegin(id_t contigId) 483 { 484 return cast(coord_t) (resultScaffoldStructure 485 .filter!(contigPart => contigPart.peek!ContigSegment !is null) 486 .map!(contigPart => contigPart.peek!ContigSegment) 487 .find!(contigPart => contigPart.globalContigId == contigId) 488 .map!(contigPart => contigPart.begin) 489 .front + 0); 490 } 491 492 size_t getNumBpsExpected() 493 { 494 mixin(traceExecution); 495 496 return mappedRegionsMask 497 .intervals 498 .sliceBy!"a.contigId == b.contigId" 499 .map!(trueScaffoldSlice => trueScaffoldSlice[$ - 1].end - trueScaffoldSlice[0].begin) 500 .sum; 501 } 502 503 size_t getNumBpsKnown() 504 { 505 mixin(traceExecution); 506 507 return mappedRegionsMask.size; 508 } 509 510 size_t getNumBpsResult() 511 { 512 mixin(traceExecution); 513 514 return resultScaffoldStructure 515 .filter!(contigPart => contigPart.peek!ContigSegment !is null) 516 .map!(contigPart => contigPart.peek!ContigSegment.length) 517 .sum; 518 } 519 520 size_t getNumBpsCorrect() 521 { 522 mixin(traceExecution); 523 524 return resultAlignment 525 .map!(ac => ac 526 .localAlignments 527 .map!(la => la.contigA.end - la.contigA.begin - la.numDiffs)) 528 .joiner 529 .sum; 530 } 531 532 size_t getNumTranslocatedGaps() 533 { 534 mixin(traceExecution); 535 536 version (assert) 537 { 538 foreach (pair; zip(referenceGaps.intervals, reconstructedGaps)) 539 assert( 540 isGapClosed(pair) ^ isGapPartlyClosed(pair) ^ isGapUnaddressed(pair), 541 format!"%s-%s-%s"( 542 isGapClosed(pair), 543 isGapPartlyClosed(pair), 544 isGapUnaddressed(pair), 545 ), 546 ); 547 } 548 549 return referenceGaps.intervals.length; 550 } 551 552 size_t getNumContigsExpected() 553 { 554 mixin(traceExecution); 555 556 return mappedRegionsMask.intervals.length; 557 } 558 559 size_t getNumCorrectContigs() 560 { 561 return correctContigsPerIdentityLevel[0].length; 562 } 563 564 size_t getNumCorrectGaps() 565 { 566 return correctGapsPerIdentityLevel[0].length; 567 } 568 569 size_t[][identityLevels.length] getCorrectRegions(in ReferenceInterval[] intervals, File detailsTabular = File()) 570 { 571 if (detailsTabular.isOpen) 572 detailsTabular.writefln!"%10s %10s %10s %10s %10s %s"( 573 "contigId", 574 "begin", 575 "end", 576 "numDiffs", 577 "percentIdentity", 578 "sequenceAlignment", 579 ); 580 581 auto sortedResultAlignment = resultAlignment.assumeSorted!"a.contigA.id < b.contigA.id"; 582 583 size_t[][identityLevels.length] identicalLengthsPerLevel; 584 foreach (identicalLengths; identicalLengthsPerLevel) 585 identicalLengths.reserve(intervals.length); 586 587 foreach (interval; parallel(intervals)) 588 { 589 auto overlappingAlignments = sortedResultAlignment 590 .equalRange(getDummyAC(interval)); 591 592 auto overlappedRegion = overlappingAlignments 593 .map!(to!(ReferenceRegion, "contigA")) 594 .fold!"a | b"(ReferenceRegion()); 595 size_t numDiffs = (ReferenceRegion(interval) - overlappedRegion).size; 596 auto alignmentString = appender!string; 597 598 foreach (overlappingAlignment; overlappingAlignments) 599 { 600 auto alignmentBegin = overlappingAlignment.first.contigA.begin; 601 auto alignmentEnd = overlappingAlignment.last.contigA.end; 602 if (interval.end <= alignmentBegin || alignmentEnd <= interval.begin) 603 continue; // Ignore non-overlaps 604 auto overlapBegin = interval.begin > alignmentBegin 605 ? interval.begin - alignmentBegin 606 : 0; 607 auto overlapEnd = interval.end < alignmentEnd 608 ? interval.end - alignmentBegin 609 : alignmentEnd - alignmentBegin; 610 611 if (overlappingAlignment.flags.complement) 612 { 613 // Inversions are not allowed 614 numDiffs += overlapEnd - overlapBegin; 615 } 616 else 617 { 618 auto exactAlignment = getOverlapAlignment(overlappingAlignment); 619 auto overlappingExactAlignment = exactAlignment[overlapBegin..min(overlapEnd, $)]; 620 621 numDiffs += overlappingExactAlignment.score; 622 623 if (detailsTabular.isOpen) 624 alignmentString ~= overlappingExactAlignment 625 .toString() 626 .replaceAll(ctRegex!(r"\n", "g"), "\\n"); 627 } 628 if (detailsTabular.isOpen) 629 alignmentString ~= "\\n\\n"; 630 } 631 632 if (detailsTabular.isOpen) 633 synchronized detailsTabular.writefln!"%10d %10d %10d %10d %.8f %s"( 634 interval.contigId, 635 interval.begin, 636 interval.end, 637 numDiffs, 638 1.0 - (numDiffs.to!double / interval.size.to!double), 639 alignmentString.data, 640 ); 641 642 foreach (i, identityLevel; identityLevels) 643 if (numDiffs <= (1.0 - identityLevel) * interval.size) 644 synchronized identicalLengthsPerLevel[i] ~= interval.size; 645 } 646 647 return identicalLengthsPerLevel; 648 } 649 650 auto getOverlapAlignment(in AlignmentChain alignmentChain) 651 { 652 auto acFingerprint = fingerprint(&alignmentChain); 653 654 return exactResultAlignments[acFingerprint]; 655 } 656 657 static AlignmentChain getDummyAC(in ReferenceInterval refInterval) 658 { 659 return AlignmentChain( 660 0, 661 AlignmentChain.Contig( 662 cast(id_t) refInterval.contigId, 663 cast(coord_t) refInterval.end, 664 ), 665 AlignmentChain.Contig(0, 1), 666 AlignmentChain.emptyFlags, 667 [AlignmentChain.LocalAlignment( 668 AlignmentChain.LocalAlignment.Locus( 669 cast(coord_t) refInterval.begin, 670 cast(coord_t) refInterval.end, 671 ), 672 AlignmentChain.LocalAlignment.Locus( 673 cast(coord_t) 0, 674 cast(coord_t) 1, 675 ), 676 )], 677 ); 678 } 679 680 bool isInnerGap(in ReferenceInterval gap) 681 { 682 return 0 < gap.begin && gap.end < trueScaffoldLength(cast(id_t) gap.contigId); 683 } 684 685 coord_t trueScaffoldLength(in id_t contigId) 686 { 687 return cast(coord_t) trueAssemblyScaffoldStructure 688 .filter!(contigPart => contigPart.peek!ContigSegment !is null) 689 .map!(contigPart => contigPart.get!ContigSegment) 690 .find!(contigPart => contigPart.globalContigId == contigId) 691 .first 692 .length; 693 } 694 695 size_t getNumClosedGaps() 696 { 697 mixin(traceExecution); 698 699 assert(referenceGaps.intervals.length == reconstructedGaps.length); 700 logJsonDebug("closedGaps", zip(referenceGaps.intervals, reconstructedGaps) 701 .filter!(pair => isGapClosed(pair)) 702 .map!"a[1].intervals" 703 .array 704 .toJson); 705 706 return zip(referenceGaps.intervals, reconstructedGaps).count!(pair => isGapClosed(pair)); 707 } 708 709 size_t getNumPartlyClosedGaps() 710 { 711 mixin(traceExecution); 712 713 assert(referenceGaps.intervals.length == reconstructedGaps.length); 714 logJsonDebug("partlyClosedGaps", zip(referenceGaps.intervals, reconstructedGaps) 715 .filter!(pair => isGapPartlyClosed(pair)) 716 .map!"a[1].intervals" 717 .array 718 .toJson); 719 720 return zip(referenceGaps.intervals, reconstructedGaps).count!(pair => isGapPartlyClosed(pair)); 721 } 722 723 size_t getNumUnaddressedGaps() 724 { 725 mixin(traceExecution); 726 727 assert(referenceGaps.intervals.length == reconstructedGaps.length); 728 logJsonDebug("unaddressedGaps", zip(referenceGaps.intervals, reconstructedGaps) 729 .filter!(pair => isGapUnaddressed(pair)) 730 .map!"a[1].intervals" 731 .array 732 .toJson); 733 734 return zip(referenceGaps.intervals, reconstructedGaps).count!(pair => isGapUnaddressed(pair)); 735 } 736 737 bool isGapClosed(Pair)(Pair gapAndInsertion) 738 { 739 auto gap = gapAndInsertion[0]; 740 auto insertion = gapAndInsertion[1]; 741 742 return insertion.size == gap.size; 743 } 744 745 bool isGapPartlyClosed(Pair)(Pair gapAndInsertion) 746 { 747 auto gap = gapAndInsertion[0]; 748 auto insertion = gapAndInsertion[1]; 749 750 return options.minInsertionLength <= insertion.size && insertion.size < gap.size; 751 } 752 753 bool isGapUnaddressed(Pair)(Pair gapAndInsertion) 754 { 755 auto gap = gapAndInsertion[0]; 756 auto insertion = gapAndInsertion[1]; 757 758 return insertion.size < min(options.minInsertionLength, gap.size); 759 } 760 761 size_t getNumBpsInClosedGaps() 762 { 763 mixin(traceExecution); 764 765 return zip(referenceGaps.intervals, reconstructedGaps) 766 .filter!(pair => isGapClosed(pair)) 767 .map!"a[1].size" 768 .sum; 769 } 770 771 size_t getNumBpsInGaps() 772 { 773 mixin(traceExecution); 774 775 return referenceGaps 776 .intervals 777 .map!(gap => gap.size) 778 .sum; 779 } 780 781 size_t getMaximumN50() 782 { 783 mixin(traceExecution); 784 785 return trueAssemblyScaffoldStructure 786 .filter!(contigPart => contigPart.peek!ContigSegment !is null) 787 .map!(contigPart => contigPart.peek!ContigSegment.length) 788 .array 789 .N!50(getNumBpsExpected); 790 } 791 792 size_t getInputN50() 793 { 794 mixin(traceExecution); 795 796 return mappedRegionsMask 797 .intervals 798 .map!(mappedInterval => mappedInterval.size) 799 .array 800 .N!50(getNumBpsExpected); 801 } 802 803 size_t getResultN50() 804 { 805 mixin(traceExecution); 806 807 return resultScaffoldStructure 808 .filter!(contigPart => contigPart.peek!ContigSegment !is null) 809 .map!(contigPart => contigPart.peek!ContigSegment.length) 810 .array 811 .N!50(getNumBpsExpected); 812 } 813 814 size_t getGapMedian() 815 { 816 mixin(traceExecution); 817 818 auto values = referenceGaps 819 .intervals 820 .map!(gap => gap.size) 821 .array; 822 823 if (values.length == 0) 824 return size_t.max; 825 else 826 return median(values); 827 } 828 829 size_t getClosedGapMedian() 830 { 831 mixin(traceExecution); 832 833 auto values = zip(referenceGaps.intervals, reconstructedGaps) 834 .filter!(pair => isGapClosed(pair)) 835 .map!"a[0].size" 836 .array; 837 838 if (values.length == 0) 839 return size_t.max; 840 else 841 return median(values); 842 } 843 844 ReferenceInterval getMaxClosedGap() 845 { 846 mixin(traceExecution); 847 848 return zip(referenceGaps.intervals, reconstructedGaps) 849 .filter!(pair => isGapClosed(pair)) 850 .map!"a[0]" 851 .maxElement!"a.size"(ReferenceInterval(0, 0, 0)); 852 } 853 854 Histogram!coord_t[identityLevels.length] getCorrectGapLengthHistograms() 855 { 856 mixin(traceExecution); 857 858 typeof(return) correctGapLengthHistograms; 859 860 foreach (i; 0 .. identityLevels.length) 861 { 862 correctGapLengthHistograms[i] = histogram( 863 options.bucketSize, 864 correctGapsPerIdentityLevel[i], 865 ); 866 // NOTE: for some reason bucketSize is not correctly set; force it 867 correctGapLengthHistograms[i].bucketSize = options.bucketSize; 868 } 869 870 return correctGapLengthHistograms; 871 } 872 873 Histogram!coord_t getClosedGapLengthHistogram() 874 { 875 mixin(traceExecution); 876 877 return histogram( 878 options.bucketSize, 879 zip(referenceGaps.intervals, reconstructedGaps) 880 .filter!(pair => isGapClosed(pair)) 881 .map!(pair => cast(coord_t) pair[0].size) 882 .array, 883 ); 884 } 885 886 Histogram!coord_t getGapLengthHistogram() 887 { 888 mixin(traceExecution); 889 890 return histogram( 891 options.bucketSize, 892 referenceGaps 893 .intervals 894 .map!(gap => cast(coord_t) gap.size) 895 .array, 896 ); 897 } 898 } 899 900 private struct Histogram(value_t) 901 { 902 alias Bucket = Tuple!( 903 value_t, "begin", 904 value_t, "end", 905 size_t, "count", 906 ); 907 908 value_t bucketSize; 909 size_t[] histogram; 910 alias histogram this; 911 912 this(in value_t bucketSize, in value_t[] values) 913 { 914 this.bucketSize = bucketSize + 0; 915 916 if (values.length == 0) 917 return; // empty histogram 918 919 auto largestValue = maxElement(values); 920 this.length = largestValue / bucketSize + 1; 921 922 foreach (value; values) 923 ++this[value / bucketSize]; 924 } 925 926 auto buckets() const pure nothrow 927 { 928 return histogram 929 .enumerate 930 .map!(rawBucket => Bucket( 931 bucketSize * cast(value_t) rawBucket.index, 932 bucketSize * cast(value_t) (rawBucket.index + 1), 933 rawBucket.value + 0, 934 )); 935 } 936 } 937 938 auto histogram(value_t)(in value_t bucketSize, in value_t[] values) 939 { 940 return Histogram!value_t(bucketSize, values); 941 } 942 943 private struct Stats 944 { 945 static enum identityLevels = [.999, .99, .95, .90]; 946 947 size_t numBpsExpected; 948 size_t numBpsKnown; 949 size_t numBpsResult; 950 size_t numBpsCorrect; 951 size_t numTranslocatedGaps; 952 size_t numCorrectGaps; 953 size_t numContigsExpected; 954 size_t numCorrectContigs; 955 size_t numClosedGaps; 956 size_t numPartlyClosedGaps; 957 size_t numUnaddressedGaps; 958 size_t numBpsInClosedGaps; 959 size_t numBpsInGaps; 960 size_t maximumN50; 961 size_t inputN50; 962 size_t resultN50; 963 size_t gapMedian; 964 size_t closedGapMedian; 965 ReferenceInterval maxClosedGap; 966 Histogram!coord_t[identityLevels.length] correctGapLengthHistograms; 967 Histogram!coord_t closedGapLengthHistogram; 968 Histogram!coord_t gapLengthHistogram; 969 970 string toJsonString() const 971 { 972 return [ 973 "numBpsExpected": numBpsExpected.toJson, 974 "numBpsKnown": numBpsKnown.toJson, 975 "numBpsResult": numBpsResult.toJson, 976 "numBpsCorrect": numBpsCorrect.toJson, 977 "numTranslocatedGaps": numTranslocatedGaps.toJson, 978 "numCorrectGaps": numCorrectGaps.toJson, 979 "numContigsExpected": numContigsExpected.toJson, 980 "numCorrectContigs": numCorrectContigs.toJson, 981 "numClosedGaps": numClosedGaps.toJson, 982 "numPartlyClosedGaps": numPartlyClosedGaps.toJson, 983 "numUnaddressedGaps": numUnaddressedGaps.toJson, 984 "numBpsInClosedGaps": numBpsInClosedGaps.toJson, 985 "numBpsInGaps": numBpsInGaps.toJson, 986 "maximumN50": maximumN50.toJson, 987 "inputN50": inputN50.toJson, 988 "resultN50": resultN50.toJson, 989 "gapMedian": gapMedian.toJson, 990 "closedGapMedian": closedGapMedian.toJson, 991 "maxClosedGap": maxClosedGap.toJson, 992 "gapLengthHistogram": histsToJson( 993 correctGapLengthHistograms[0], 994 correctGapLengthHistograms[1], 995 correctGapLengthHistograms[2], 996 correctGapLengthHistograms[3], 997 closedGapLengthHistogram, 998 gapLengthHistogram, 999 ), 1000 ].toJsonString; 1001 } 1002 1003 string toTabular() const 1004 { 1005 auto columnWidth = columnWidth(); 1006 auto rows =[ 1007 format!"numContigsExpected: %*d"(columnWidth, numContigsExpected), 1008 format!"numCorrectContigs: %*d"(columnWidth, numCorrectContigs), 1009 1010 format!"numTranslocatedGaps: %*d"(columnWidth, numTranslocatedGaps), 1011 format!"numClosedGaps: %*d"(columnWidth, numClosedGaps), 1012 format!"numPartlyClosedGaps: %*d"(columnWidth, numPartlyClosedGaps), 1013 format!"numUnaddressedGaps: %*d"(columnWidth, numUnaddressedGaps), 1014 format!"numCorrectGaps: %*d"(columnWidth, numCorrectGaps), 1015 1016 format!"numBpsExpected: %*d"(columnWidth, numBpsExpected), 1017 format!"numBpsKnown: %*d"(columnWidth, numBpsKnown), 1018 format!"numBpsResult: %*d"(columnWidth, numBpsResult), 1019 format!"numBpsCorrect: %*d"(columnWidth, numBpsCorrect), 1020 format!"numBpsInGaps: %*d"(columnWidth, numBpsInGaps), 1021 format!"numBpsInClosedGaps: %*d"(columnWidth, numBpsInClosedGaps), 1022 format!"numBpsInNonClosedGaps: %*d"(columnWidth, numBpsInGaps - numBpsInClosedGaps), 1023 1024 format!"maximumN50: %*d"(columnWidth, maximumN50), 1025 format!"inputN50: %*d"(columnWidth, inputN50), 1026 format!"resultN50: %*d"(columnWidth, resultN50), 1027 format!"gapMedian: %*d"(columnWidth, gapMedian), 1028 format!"closedGapMedian: %*d"(columnWidth, closedGapMedian), 1029 format!"maxClosedGap: %*s"(columnWidth, toString(maxClosedGap)), 1030 gapLengthHistogram.length > 0 1031 ? format!"gapLengthHistogram:\n \n%s"(histsToString( 1032 correctGapLengthHistograms[0], 1033 correctGapLengthHistograms[1], 1034 correctGapLengthHistograms[2], 1035 correctGapLengthHistograms[3], 1036 closedGapLengthHistogram, 1037 gapLengthHistogram, 1038 )) 1039 : null, 1040 ]; 1041 1042 return format!"%-(%s\n%)"(rows.filter!"a !is null"); 1043 } 1044 1045 const string toString(in ReferenceInterval interval) 1046 { 1047 return format!"[%d, %d) | len=%d"(interval.begin, interval.end, interval.size); 1048 } 1049 1050 const string histsToString(Hists...)(in Hists hists) 1051 { 1052 assert(hists.length >= 1); 1053 assert([hists].all!(h => h.bucketSize == hists[0].bucketSize)); 1054 auto limitsWidth = numWidth(hists[0].bucketSize * [hists].map!"a.length".maxElement); 1055 auto countWidths = [hists] 1056 .map!(h => numWidth(max(1, h.histogram.length == 0 ? 1 : maxElement(h.histogram)))) 1057 .array; 1058 1059 auto bucketsList = tupleMap!(h => h.buckets.array)(hists); 1060 auto rows = zip(StoppingPolicy.longest, bucketsList.expand) 1061 .map!(buckets => format!" %*d%s"( 1062 limitsWidth, 1063 max(tupleMap!"a.end"(buckets.expand).expand), 1064 zip(countWidths, [buckets.expand]) 1065 .map!(pair => format!" %*d"(pair[0], pair[1].count)) 1066 .join, 1067 )) 1068 .array; 1069 1070 return format!"%-(%s\n%)"(rows); 1071 } 1072 1073 const Json histsToJson(Hists...)(in Hists hists) 1074 { 1075 assert(hists.length >= 1); 1076 assert([hists].all!(h => h.bucketSize == hists[0].bucketSize)); 1077 1078 auto bucketsList = tupleMap!(h => h.buckets.array)(hists); 1079 auto rows = zip(StoppingPolicy.longest, bucketsList.expand) 1080 .map!(buckets => [ 1081 "limit": max(tupleMap!"a.end"(buckets.expand).expand).toJson, 1082 "counts": [buckets.expand] 1083 .map!"a.count" 1084 .array 1085 .toJson, 1086 ]) 1087 .array; 1088 1089 return rows.toJson; 1090 } 1091 1092 protected size_t columnWidth() const 1093 { 1094 auto numWidth = numWidth(max( 1095 numBpsExpected, 1096 numBpsKnown, 1097 numBpsResult, 1098 numTranslocatedGaps, 1099 numBpsCorrect, 1100 numCorrectGaps, 1101 numContigsExpected, 1102 numCorrectContigs, 1103 numClosedGaps, 1104 numPartlyClosedGaps, 1105 numUnaddressedGaps, 1106 numBpsInClosedGaps, 1107 numBpsInGaps, 1108 maximumN50, 1109 inputN50, 1110 resultN50, 1111 gapMedian, 1112 closedGapMedian, 1113 )); 1114 auto strWidth = max( 1115 0, // dummy 1116 toString(maxClosedGap).length, 1117 ); 1118 1119 return max(numWidth, strWidth); 1120 } 1121 1122 static protected size_t numWidth(Int)(Int i) nothrow 1123 { 1124 auto numWidth = cast(size_t) ceil(log10(i)); 1125 1126 return numWidth; 1127 } 1128 }