1 /** 2 This is he algorithm for building pile ups. 3 4 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dentist.commands.collectPileUps.pileups; 10 11 import dentist.common.alignments : 12 AlignmentChain, 13 AlignmentLocationSeed, 14 arithmetic_t, 15 id_t, 16 coord_t, 17 getType, 18 isValid, 19 PileUp, 20 ReadAlignment, 21 SeededAlignment, 22 to; 23 import dentist.common.binio : ArrayStorage; 24 import dentist.common.scaffold : 25 buildScaffold, 26 concatenatePayloads, 27 discardAmbiguousJoins, 28 isGap, 29 Join, 30 mergeExtensionsWithGaps, 31 Scaffold; 32 import dentist.util.algorithm : orderLexicographically; 33 import dentist.util.math : filterEdges; 34 import dentist.util.log; 35 import std.algorithm : 36 any, 37 canFind, 38 chunkBy, 39 equal, 40 filter, 41 joiner, 42 map, 43 min, 44 sort, 45 SwapStrategy; 46 import std.algorithm : equal; 47 import std.array : array; 48 import std.conv : to; 49 import std.range : chain, iota, only, slide, walkLength, zip; 50 import std.typecons : No; 51 import vibe.data.json : toJson = serializeToJson; 52 53 54 private auto collectPileUps(Scaffold!(ReadAlignment[]) scaffold) 55 { 56 return scaffold 57 .edges 58 .filter!"a.payload.length > 0" 59 .map!"a.payload" 60 .filter!(pileUp => pileUp.isValid); 61 } 62 63 private void debugLogPileUps(string state, Scaffold!(ReadAlignment[]) scaffold) 64 { 65 logJsonDebug( 66 "state", state, 67 "pileUps", collectPileUps(scaffold) 68 .map!(pileUp => [ 69 "type": pileUp.getType.to!string.toJson, 70 "readAlignments": pileUp.map!"a[]".array.toJson, 71 ].toJson) 72 .array 73 .toJson, 74 ); 75 } 76 77 PileUp[] build(Options)( 78 in size_t numReferenceContigs, 79 AlignmentChain[] candidates, 80 in Options options, 81 ) 82 { 83 alias isSameRead = (a, b) => a.contigB.id == b.contigB.id; 84 alias Payload = ReadAlignment[]; 85 alias ReadAlignmentJoin = Join!Payload; 86 87 candidates.sort!("a.contigB.id < b.contigB.id", SwapStrategy.stable); 88 89 auto readAlignmentJoins = candidates 90 .filter!"!a.flags.disabled" 91 .chunkBy!isSameRead 92 .map!collectReadAlignments 93 .joiner 94 .filter!"a.isValid" 95 .map!"a.getInOrder()" 96 .map!(to!ReadAlignmentJoin); 97 auto alignmentsScaffold = buildScaffold!(concatenatePayloads!Payload, Payload)(numReferenceContigs + 0, readAlignmentJoins); 98 debugLogPileUps("raw", alignmentsScaffold); 99 alignmentsScaffold = alignmentsScaffold.discardAmbiguousJoins!Payload(options.bestPileUpMargin); 100 debugLogPileUps("unambiguous", alignmentsScaffold); 101 alignmentsScaffold.filterEdges!(e => !e.isGap || e.payload.length >= options.minSpanningReads); 102 debugLogPileUps("minSpanningEnforced", alignmentsScaffold); 103 alignmentsScaffold = alignmentsScaffold.mergeExtensionsWithGaps!("a ~ b", Payload); 104 debugLogPileUps("extensionsMerged", alignmentsScaffold); 105 auto pileUps = collectPileUps(alignmentsScaffold).array; 106 107 return pileUps; 108 } 109 110 unittest 111 { 112 // FIXME add test case with contig spanning read 113 // c1 c2 c3 114 // 0 5 10 15 0 5 10 15 0 5 10 15 115 // ref: |---->---->----|.........|---->---->----|.........|---->---->----| 116 // reads: . . . : . : . . : . : . . . 117 // . #1 |---->---->--| . : . . : . : . . . 118 // . #2 |----<----<--| . : . . : . : . . . 119 // . #3 |---->---->----| : . . : . : . . . 120 // . . #4 |---->----| : . . : . : . . . 121 // . . #5 |---->---->---->----| . : . : . . . 122 // . . #6 |----<----<----<----| . : . : . . . 123 // . . #7 |->---->---->---->--| . : . : . . . 124 // . . . : #8 |---->---->--| . : . : . . . 125 // . . . : #9 |----<----<--| . : . : . . . 126 // . . . :#10 |---->---->----| : . : . . . 127 // . . . : #11 |----->----| : . : . . . 128 // . . . : . : . . : . : . . . 129 // . . . : . :#12 |---->---->--| . : . . . 130 // . . . : . :#13 |----<----<--| . : . . . 131 // . . . : . :#14 |---->---->----| : . . . 132 // . . . : . : .#15 |---->----| : . . . 133 // . . . : . : .#16 |---->---->---->----| . . 134 // . . . : . : .#17 |----<----<----<----| . . 135 // . . . : . : . #18 |->---->---->---->--| . . 136 // . . . : . : . . :#19 |---->---->--| . . 137 // . . . : . : . . :#20 |----<----<--| . . 138 // . . . : . : . . :#21 |---->---->----| . 139 // . . . : . : . . : #22 |----->----| . 140 import std.algorithm : clamp; 141 142 with (AlignmentChain) with (LocalAlignment) 143 { 144 static struct Options 145 { 146 size_t minSpanningReads = 1; 147 double bestPileUpMargin = 1.0; 148 } 149 150 id_t alignmentChainId = 0; 151 id_t contReadId = 0; 152 ReadAlignment getDummyRead(id_t beginContigId, arithmetic_t beginIdx, 153 id_t endContigId, arithmetic_t endIdx, Complement complement) 154 { 155 static enum contigLength = 16; 156 static enum gapLength = 9; 157 static enum numDiffs = 0; 158 159 alignmentChainId += 2; 160 auto readId = ++contReadId; 161 auto readLength = beginContigId == endContigId 162 ? endIdx - beginIdx 163 : contigLength - beginIdx + gapLength + endIdx; 164 auto flags = complement ? Flags(Flag.complement) : emptyFlags; 165 coord_t firstReadBeginIdx; 166 coord_t firstReadEndIdx; 167 168 if (beginIdx < 0) 169 { 170 firstReadBeginIdx = readLength - endIdx; 171 firstReadEndIdx = readLength; 172 } 173 else 174 { 175 firstReadBeginIdx = 0; 176 firstReadEndIdx = contigLength - beginIdx; 177 } 178 179 beginIdx = clamp(beginIdx, 0, contigLength); 180 endIdx = clamp(endIdx, 0, contigLength); 181 182 if (beginContigId == endContigId) 183 { 184 return ReadAlignment(SeededAlignment.from(AlignmentChain( 185 alignmentChainId - 2, 186 Contig(beginContigId, contigLength), 187 Contig(readId, readLength), 188 flags, 189 [ 190 LocalAlignment( 191 Locus(beginIdx, beginIdx + 1), 192 Locus(firstReadBeginIdx, firstReadBeginIdx + 1), 193 numDiffs, 194 ), 195 LocalAlignment( 196 Locus(endIdx - 1, endIdx), 197 Locus(firstReadEndIdx - 1, firstReadEndIdx), 198 numDiffs, 199 ), 200 ], 201 )).front); 202 } 203 else 204 { 205 auto secondReadBeginIdx = readLength - endIdx; 206 auto secondReadEndIdx = readLength; 207 208 return ReadAlignment( 209 SeededAlignment.from(AlignmentChain( 210 alignmentChainId - 2, 211 Contig(beginContigId, contigLength), 212 Contig(readId, readLength), 213 flags, 214 [ 215 LocalAlignment( 216 Locus(beginIdx, beginIdx + 1), 217 Locus(firstReadBeginIdx, firstReadBeginIdx + 1), 218 numDiffs, 219 ), 220 LocalAlignment( 221 Locus(contigLength - 1, contigLength), 222 Locus(firstReadEndIdx - 1, firstReadEndIdx), 223 numDiffs, 224 ), 225 ], 226 )).front, 227 SeededAlignment.from(AlignmentChain( 228 alignmentChainId - 1, 229 Contig(endContigId, contigLength), 230 Contig(readId, readLength), 231 flags, 232 [ 233 LocalAlignment( 234 Locus(0, 1), 235 Locus(secondReadBeginIdx, secondReadBeginIdx + 1), 236 numDiffs, 237 ), 238 LocalAlignment( 239 Locus(endIdx - 1, endIdx), 240 Locus(secondReadEndIdx - 1, secondReadEndIdx), 241 numDiffs, 242 ), 243 ], 244 )).front, 245 ); 246 } 247 } 248 249 id_t c1 = 1; 250 id_t c2 = 2; 251 id_t c3 = 3; 252 auto pileUps = [ 253 [ 254 getDummyRead(c1, 5, c1, 18, Complement.no), // #1 255 getDummyRead(c1, 5, c1, 18, Complement.yes), // #2 256 getDummyRead(c1, 5, c1, 20, Complement.no), // #3 257 getDummyRead(c1, 10, c1, 20, Complement.no), // #4 258 getDummyRead(c1, 10, c2, 5, Complement.no), // #5 259 getDummyRead(c1, 10, c2, 5, Complement.yes), // #6 260 getDummyRead(c1, 13, c2, 8, Complement.no), // #7 261 getDummyRead(c2, -5, c2, 8, Complement.no), // #8 262 getDummyRead(c2, -5, c2, 8, Complement.yes), // #9 263 getDummyRead(c2, -5, c2, 10, Complement.no), // #10 264 getDummyRead(c2, -1, c2, 10, Complement.no), // #11 265 ], 266 [ 267 getDummyRead(c2, 5, c2, 18, Complement.no), // #12 268 getDummyRead(c2, 5, c2, 18, Complement.yes), // #13 269 getDummyRead(c2, 5, c2, 20, Complement.no), // #14 270 getDummyRead(c2, 10, c2, 20, Complement.no), // #15 271 getDummyRead(c2, 10, c3, 5, Complement.no), // #16 272 getDummyRead(c2, 10, c3, 5, Complement.yes), // #17 273 getDummyRead(c2, 13, c3, 8, Complement.no), // #18 274 getDummyRead(c3, -5, c3, 8, Complement.no), // #19 275 getDummyRead(c3, -5, c3, 8, Complement.yes), // #20 276 getDummyRead(c3, -5, c3, 10, Complement.no), // #21 277 getDummyRead(c3, -1, c3, 10, Complement.no), // #22 278 ], 279 ]; 280 auto alignmentChains = pileUps 281 .joiner 282 .map!"a[]" 283 .joiner 284 .map!"a.alignment" 285 .array; 286 auto computedPileUps = build(3, alignmentChains, Options()); 287 288 foreach (pileUp, computedPileUp; zip(pileUps, computedPileUps)) 289 { 290 // pileUp is subset of computedPileUp 291 foreach (readAlignment; pileUp) 292 { 293 assert(computedPileUp.canFind(readAlignment)); 294 } 295 // computedPileUp is subset of pileUp 296 foreach (readAlignment; computedPileUp) 297 { 298 assert(pileUp.canFind(readAlignment)); 299 } 300 } 301 } 302 } 303 304 305 306 // Not meant for public usage. 307 ReadAlignment[] collectReadAlignments(Chunk)(Chunk sameReadAlignments) 308 { 309 alias beginRelToContigB = (alignment) => alignment.complement 310 ? alignment.contigB.length - alignment.last.contigB.end 311 : alignment.first.contigB.begin; 312 alias endRelToContigB = (alignment) => alignment.complement 313 ? alignment.contigB.length - alignment.first.contigB.begin 314 : alignment.last.contigB.end; 315 alias seedRelToContigB = (alignment) => alignment.complement 316 ? 0 - alignment.seed 317 : 0 + alignment.seed; 318 alias orderByLocusAndSeed = orderLexicographically!( 319 SeededAlignment, 320 beginRelToContigB, 321 endRelToContigB, 322 seedRelToContigB, 323 ); 324 alias seededPartsOfOneAlignment = (a, b) => a.alignment == b.alignment && a.seed != b.seed; 325 alias shareReadSequence = (a, b) => endRelToContigB(a) > beginRelToContigB(b); 326 alias emptyRange = () => cast(ReadAlignment[])[]; 327 328 auto seededAlignments = sameReadAlignments.map!(SeededAlignment.from).joiner.array; 329 seededAlignments.sort!orderByLocusAndSeed; 330 331 if (seededAlignments.length == 0) 332 { 333 return emptyRange(); 334 } 335 336 // Validate seeded alignments and discard accordingly. 337 foreach (saPair; seededAlignments.slide!(No.withPartial)(2)) 338 { 339 if (shareReadSequence(saPair[0], saPair[1]) 340 && !seededPartsOfOneAlignment(saPair[0], saPair[1])) 341 { 342 // NOTE this discards reads supporting ambiguous joins: 343 // 344 // ``` 345 // contig A1: |---------| 346 // read: |---------| 347 // contig A2 & C: |---------| |---------| 348 // ``` 349 return emptyRange(); 350 } 351 } 352 353 // Collect read alignments 354 bool startWithExtension = beginRelToContigB(seededAlignments[0]) > 0; 355 size_t sliceStart = startWithExtension ? 1 : 0; 356 357 auto remainingReadAlignments = iota(sliceStart, seededAlignments.length, 2) 358 .map!(i => ReadAlignment(seededAlignments[i .. min(i + 2, $)])); 359 auto readAlignments = startWithExtension 360 ? chain( 361 only(ReadAlignment(seededAlignments[0 .. 1])), 362 remainingReadAlignments, 363 ).array 364 : remainingReadAlignments.array; 365 366 if (readAlignments.any!"!a.isValid") 367 { 368 // If one read alignment is invalid we should not touch this read at all. 369 return emptyRange(); 370 } 371 372 return readAlignments; 373 } 374 375 unittest 376 { 377 alias Contig = AlignmentChain.Contig; 378 alias Flags = AlignmentChain.Flags; 379 enum emptyFlags = AlignmentChain.emptyFlags; 380 enum complement = AlignmentChain.Flag.complement; 381 alias LocalAlignment = AlignmentChain.LocalAlignment; 382 alias Locus = LocalAlignment.Locus; 383 alias Seed = AlignmentLocationSeed; 384 385 { 386 // Case 1: 387 // 388 // |-->-->-->--| |-->-->-->--| |-->-->-->--| 389 // 390 // |-----> <-----------> <-----| 391 auto alignmentChains = [ 392 AlignmentChain( 393 0, 394 Contig(1, 20), 395 Contig(1, 60), 396 emptyFlags, 397 [LocalAlignment( 398 Locus(10, 20), 399 Locus(0, 10), 400 )] 401 ), 402 AlignmentChain( 403 1, 404 Contig(2, 20), 405 Contig(1, 60), 406 emptyFlags, 407 [LocalAlignment( 408 Locus(0, 20), 409 Locus(20, 40), 410 )] 411 ), 412 AlignmentChain( 413 2, 414 Contig(3, 20), 415 Contig(1, 60), 416 emptyFlags, 417 [LocalAlignment( 418 Locus(0, 10), 419 Locus(50, 60), 420 )] 421 ), 422 ]; 423 assert(collectReadAlignments(alignmentChains).equal([ 424 ReadAlignment([ 425 SeededAlignment(alignmentChains[0], Seed.back), 426 SeededAlignment(alignmentChains[1], Seed.front), 427 ]), 428 ReadAlignment([ 429 SeededAlignment(alignmentChains[1], Seed.back), 430 SeededAlignment(alignmentChains[2], Seed.front), 431 ]), 432 ])); 433 } 434 { 435 // Case 2: 436 // 437 // |-->-->-->--| |--<--<--<--| |-->-->-->--| 438 // 439 // |-----> <-----------> <-----| 440 auto alignmentChains = [ 441 AlignmentChain( 442 0, 443 Contig(1, 20), 444 Contig(1, 60), 445 emptyFlags, 446 [LocalAlignment( 447 Locus(10, 20), 448 Locus(0, 10), 449 )] 450 ), 451 AlignmentChain( 452 1, 453 Contig(2, 20), 454 Contig(1, 60), 455 Flags(complement), 456 [LocalAlignment( 457 Locus(0, 20), 458 Locus(20, 40), 459 )] 460 ), 461 AlignmentChain( 462 2, 463 Contig(3, 20), 464 Contig(1, 60), 465 emptyFlags, 466 [LocalAlignment( 467 Locus(0, 10), 468 Locus(50, 60), 469 )] 470 ), 471 ]; 472 assert(collectReadAlignments(alignmentChains).equal([ 473 ReadAlignment([ 474 SeededAlignment(alignmentChains[0], Seed.back), 475 SeededAlignment(alignmentChains[1], Seed.back), 476 ]), 477 ReadAlignment([ 478 SeededAlignment(alignmentChains[1], Seed.front), 479 SeededAlignment(alignmentChains[2], Seed.front), 480 ]), 481 ])); 482 } 483 { 484 // Case 3: 485 // 486 // |--<--<--<--| |-->-->-->--| 487 // 488 // <-----------> <-----| 489 auto alignmentChains = [ 490 AlignmentChain( 491 1, 492 Contig(2, 20), 493 Contig(1, 60), 494 Flags(complement), 495 [LocalAlignment( 496 Locus(0, 20), 497 Locus(20, 40), 498 )] 499 ), 500 AlignmentChain( 501 2, 502 Contig(3, 20), 503 Contig(1, 60), 504 emptyFlags, 505 [LocalAlignment( 506 Locus(0, 10), 507 Locus(50, 60), 508 )] 509 ), 510 ]; 511 assert(collectReadAlignments(alignmentChains).equal([ 512 ReadAlignment([ 513 SeededAlignment(alignmentChains[0], Seed.back), 514 ]), 515 ReadAlignment([ 516 SeededAlignment(alignmentChains[0], Seed.front), 517 SeededAlignment(alignmentChains[1], Seed.front), 518 ]), 519 ])); 520 } 521 { 522 // Case 4: 523 // 524 // |-->-->-->--| |--<--<--<--| 525 // 526 // |-----> <-----------> 527 auto alignmentChains = [ 528 AlignmentChain( 529 0, 530 Contig(1, 20), 531 Contig(1, 60), 532 emptyFlags, 533 [LocalAlignment( 534 Locus(10, 20), 535 Locus(0, 10), 536 )] 537 ), 538 AlignmentChain( 539 1, 540 Contig(2, 20), 541 Contig(1, 60), 542 Flags(complement), 543 [LocalAlignment( 544 Locus(0, 20), 545 Locus(20, 40), 546 )] 547 ), 548 ]; 549 assert(collectReadAlignments(alignmentChains).equal([ 550 ReadAlignment([ 551 SeededAlignment(alignmentChains[0], Seed.back), 552 SeededAlignment(alignmentChains[1], Seed.back), 553 ]), 554 ReadAlignment([ 555 SeededAlignment(alignmentChains[1], Seed.front), 556 ]), 557 ])); 558 } 559 { 560 // Case 5: 561 // 562 // |--<--<--<--| 563 // 564 // <-----------> 565 auto alignmentChains = [ 566 AlignmentChain( 567 1, 568 Contig(2, 20), 569 Contig(1, 60), 570 Flags(complement), 571 [LocalAlignment( 572 Locus(0, 20), 573 Locus(20, 40), 574 )] 575 ), 576 ]; 577 assert(collectReadAlignments(alignmentChains).equal([ 578 ReadAlignment([ 579 SeededAlignment(alignmentChains[0], Seed.back), 580 ]), 581 ReadAlignment([ 582 SeededAlignment(alignmentChains[0], Seed.front), 583 ]), 584 ])); 585 } 586 { 587 // Case 6: 588 // 589 // |-->-->-->--| 590 // |-->-->-->--| |--<--<--<--| |-->-->-->--| 591 // 592 // |-----> <-----------> <-----| 593 auto alignmentChains = [ 594 AlignmentChain( 595 0, 596 Contig(1, 20), 597 Contig(1, 60), 598 emptyFlags, 599 [LocalAlignment( 600 Locus(10, 20), 601 Locus(0, 10), 602 )] 603 ), 604 AlignmentChain( 605 1, 606 Contig(2, 20), 607 Contig(1, 60), 608 Flags(complement), 609 [LocalAlignment( 610 Locus(0, 20), 611 Locus(20, 40), 612 )] 613 ), 614 AlignmentChain( 615 2, 616 Contig(3, 20), 617 Contig(1, 60), 618 emptyFlags, 619 [LocalAlignment( 620 Locus(0, 10), 621 Locus(50, 60), 622 )] 623 ), 624 AlignmentChain( 625 3, 626 Contig(4, 20), 627 Contig(1, 60), 628 emptyFlags, 629 [LocalAlignment( 630 Locus(10, 20), 631 Locus(0, 10), 632 )] 633 ), 634 ]; 635 assert(collectReadAlignments(alignmentChains).length == 0); 636 } 637 { 638 // Case 7: 639 // 640 // |-->-->-->--| 641 // |-->-->-->--| |--<--<--<--| |-->-->-->--| 642 // 643 // |-----> <-----------> <-----| 644 auto alignmentChains = [ 645 AlignmentChain( 646 0, 647 Contig(1, 20), 648 Contig(1, 60), 649 emptyFlags, 650 [LocalAlignment( 651 Locus(10, 20), 652 Locus(0, 10), 653 )] 654 ), 655 AlignmentChain( 656 1, 657 Contig(2, 20), 658 Contig(1, 60), 659 Flags(complement), 660 [LocalAlignment( 661 Locus(0, 20), 662 Locus(20, 40), 663 )] 664 ), 665 AlignmentChain( 666 2, 667 Contig(3, 20), 668 Contig(1, 60), 669 emptyFlags, 670 [LocalAlignment( 671 Locus(0, 10), 672 Locus(50, 60), 673 )] 674 ), 675 AlignmentChain( 676 3, 677 Contig(4, 20), 678 Contig(1, 60), 679 emptyFlags, 680 [LocalAlignment( 681 Locus(0, 20), 682 Locus(20, 40), 683 )] 684 ), 685 ]; 686 assert(collectReadAlignments(alignmentChains).length == 0); 687 } 688 { 689 // Case 8: 690 // 691 // |-->-->-->--| 692 // |-->-->-->--| |--<--<--<--| |-->-->-->--| 693 // 694 // |-----> <-----------> <-----| 695 auto alignmentChains = [ 696 AlignmentChain( 697 0, 698 Contig(1, 20), 699 Contig(1, 60), 700 emptyFlags, 701 [LocalAlignment( 702 Locus(10, 20), 703 Locus(0, 10), 704 )] 705 ), 706 AlignmentChain( 707 1, 708 Contig(2, 20), 709 Contig(1, 60), 710 Flags(complement), 711 [LocalAlignment( 712 Locus(0, 20), 713 Locus(20, 40), 714 )] 715 ), 716 AlignmentChain( 717 2, 718 Contig(3, 20), 719 Contig(1, 60), 720 emptyFlags, 721 [LocalAlignment( 722 Locus(0, 10), 723 Locus(50, 60), 724 )] 725 ), 726 AlignmentChain( 727 3, 728 Contig(4, 30), 729 Contig(1, 60), 730 emptyFlags, 731 [LocalAlignment( 732 Locus(0, 30), 733 Locus(30, 60), 734 )] 735 ), 736 ]; 737 assert(collectReadAlignments(alignmentChains).length == 0); 738 } 739 }