1 /** 2 This is the `maskRepetitiveRegions` command of `dentist`. 3 4 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dentist.commands.maskRepetitiveRegions; 10 11 import dentist.commandline : DentistCommand, OptionsFor; 12 import dentist.common : 13 ReferenceInterval, 14 ReferenceRegion; 15 import dentist.common.alignments : 16 AlignmentChain, 17 coord_t, 18 id_t; 19 import dentist.dazzler : 20 getAlignments, 21 writeMask; 22 import dentist.util.log; 23 import std.algorithm : 24 joiner, 25 map, 26 sort, 27 uniq; 28 import std.array : appender, array; 29 import std.format : format; 30 import std.range : 31 chain, 32 only; 33 import std.range.primitives : 34 ElementType, 35 isInputRange; 36 import std.typecons : Tuple; 37 import vibe.data.json : toJson = serializeToJson; 38 39 /// Options for the `collectPileUps` command. 40 alias Options = OptionsFor!(DentistCommand.maskRepetitiveRegions); 41 42 /// Execute the `collectPileUps` command with `options`. 43 void execute(in Options options) 44 { 45 auto assessor = new RepeatMaskAssessor(options); 46 47 assessor.run(); 48 } 49 50 /// This class comprises the `collectPileUps` step of the `dentist` algorithm 51 class RepeatMaskAssessor 52 { 53 protected const Options options; 54 protected AlignmentChain[] selfAlignment; 55 protected AlignmentChain[] readsAlignment; 56 protected ReferenceRegion repetitiveRegions; 57 58 this(in ref Options options) 59 { 60 this.options = options; 61 } 62 63 void run() 64 { 65 mixin(traceExecution); 66 67 readInputs(); 68 assessRepeatStructure(); 69 writeRepeatMask(); 70 } 71 72 protected void readInputs() 73 { 74 mixin(traceExecution); 75 76 selfAlignment = getAlignments( 77 options.refDb, 78 options.selfAlignmentFile, 79 options.workdir, 80 ); 81 readsAlignment = getAlignments( 82 options.refDb, 83 options.readsDb, 84 options.readsAlignmentFile, 85 options.workdir, 86 ); 87 88 if (selfAlignment.length == 0) 89 logJsonWarn("info", "empty self alignment"); 90 if (readsAlignment.length == 0) 91 logJsonWarn("info", "empty ref vs. reads alignment"); 92 } 93 94 void assessRepeatStructure() 95 { 96 mixin(traceExecution); 97 98 auto assessmentStages = [ 99 assessmentStage( 100 "self-alignment", 101 new BadAlignmentCoverageAssessor( 102 options.coverageBoundsSelf[0], 103 options.coverageBoundsSelf[1] 104 ), 105 selfAlignment, 106 ), 107 assessmentStage( 108 "reads-alignment", 109 new BadAlignmentCoverageAssessor( 110 options.coverageBoundsReads[0], 111 options.coverageBoundsReads[1] 112 ), 113 readsAlignment, 114 ), 115 ]; 116 117 foreach (stage; assessmentStages) 118 { 119 auto repetitiveRegions = stage.assessor(stage.input); 120 121 logJsonDiagnostic( 122 "assessor", stage.name, 123 "repetitiveRegions", shouldLog(LogLevel.debug_) 124 ? repetitiveRegions.intervals.toJson 125 : toJson(null), 126 "numRepetitiveRegions", repetitiveRegions.intervals.length, 127 ); 128 129 if (shouldLog(LogLevel.debug_)) 130 { 131 auto maskName = format!"%s-%s"(options.repeatMask, stage.name); 132 133 writeMask( 134 options.refDb, 135 maskName, 136 repetitiveRegions.intervals, 137 options.workdir, 138 ); 139 } 140 141 this.repetitiveRegions |= repetitiveRegions; 142 } 143 144 logJsonDiagnostic( 145 "assessor", "finalResult", 146 "repetitiveRegions", shouldLog(LogLevel.debug_) 147 ? this.repetitiveRegions.intervals.toJson 148 : toJson(null), 149 "numRepetitiveRegions", this.repetitiveRegions.intervals.length, 150 ); 151 } 152 153 protected void writeRepeatMask() 154 { 155 mixin(traceExecution); 156 157 writeMask( 158 options.refDb, 159 options.repeatMask, 160 repetitiveRegions.intervals, 161 options.workdir, 162 ); 163 } 164 } 165 166 // see `assessmentStage` below. 167 alias MaskAssessmentStage(Assessor : RepeatAssessor) = Tuple!( 168 string, "name", 169 Assessor, "assessor", 170 const(AlignmentChain[]), "input", 171 ); 172 173 /// Construct a stage of the repeat mask assessment pipeline. 174 MaskAssessmentStage!Assessor assessmentStage(Assessor : RepeatAssessor)( 175 string name, 176 Assessor assessor, 177 const(AlignmentChain[]) input, 178 ) 179 { 180 return typeof(return)(name, assessor, input); 181 } 182 183 /** 184 A `RepeatAssessor` should generate a repeat mask derived from a set of 185 alignment chains. 186 */ 187 interface RepeatAssessor 188 { 189 ReferenceRegion opCall(in AlignmentChain[] input); 190 } 191 192 /** 193 Mask reference regions where the alignment coverage is not within set 194 limits. This helps to identify repetitive or bad quality regions. 195 */ 196 class BadAlignmentCoverageAssessor : RepeatAssessor 197 { 198 double lowerLimit; 199 double upperLimit; 200 201 /// Create an assessor with these limits. 202 this(double lowerLimit, double upperLimit) 203 { 204 this.lowerLimit = lowerLimit; 205 this.upperLimit = upperLimit; 206 } 207 208 version (unittest) 209 { 210 /** 211 Generate a list of alignment chains for testing: 212 213 ``` 214 c1 c2 c3 215 0 5 10 15 20 25 30 0 5 10 15 0 5 10 15 216 ref: [-----------------------------) [--------------) [--------------) 217 reads: . . . . . . . . . . . . . . . 218 . #1 [------------) . . . #12 [--) . . . #23 .[--). . . 219 . #2 [------------) . . . #13 [--) . . . #24 . [--) . . 220 . #3 [--------------) . . #14 [----) . . #25 . [--) . . 221 . . #4 [---------) . . #15 [----) . . #26 . [--) . . 222 . . #5 [-------------------) #16 [--------------) #27 . [--) . . 223 . . #6 [-------------------) #17 [--------------) #28 . .[--). . 224 . . #7 [----------------) #18 [--------------) #29 . . [--) . 225 . . . . #8 [---------) .#19 [---------) #30 . . [--) . 226 . . . . #9 [---------) .#20 [---------) #31 . . [--) . 227 . . . .#10 [---------) .#21 [---------) #32 . . [--) . 228 . . . . #11 [-----) . #22 [-----) #33 . . .[--). 229 . . . . . . . . . . . . . . . 230 mask: [====) [=======) [=========) [==) [=========) [==) . . [==) 231 . . . . . . . . . . . . . . . 232 cov: ^ . . . . . . . . . . . . . . 233 | . . . . . . . . . . . . . . 234 | . . +----+ . +----+. +-+ . +----+. . . . . 235 | . +--+####| +---+####|. |#| +---+####|. . . . . 236 5 |.........|#######+-+########|. |#+--+########|. . . . . 237 | . |##################|. |#############|. . . . . 238 | +----+##################|. |#############|. . +-------+ . 239 | |#######################|. |#############|. . ++#######++ . 240 | |#######################|. |#############|. .++#########++ . 241 0 +----+-----------------------+--------+-------------+--------+-----------+----> 242 ``` 243 */ 244 private static AlignmentChain[] getTestAlignments() 245 { 246 with (AlignmentChain) with (LocalAlignment) 247 { 248 id_t alignmentChainId = 0; 249 id_t contReadId = 0; 250 AlignmentChain getDummyAlignment(id_t contigId, 251 coord_t contigLength, coord_t beginIdx, coord_t endIdx) 252 { 253 return AlignmentChain( 254 ++alignmentChainId, 255 Contig(contigId, contigLength), 256 Contig(alignmentChainId, endIdx), 257 emptyFlags, 258 [ 259 LocalAlignment( 260 Locus(beginIdx, beginIdx + 1), 261 Locus(0, 1), 262 0, 263 ), 264 LocalAlignment( 265 Locus(endIdx - 1, endIdx), 266 Locus(2, 3), 267 0, 268 ), 269 ], 270 ); 271 } 272 273 return [ 274 getDummyAlignment(1, 30, 5, 18), // #1 275 getDummyAlignment(1, 30, 5, 18), // #2 276 getDummyAlignment(1, 30, 5, 20), // #3 277 getDummyAlignment(1, 30, 10, 20), // #4 278 getDummyAlignment(1, 30, 10, 30), // #5 279 getDummyAlignment(1, 30, 10, 30), // #6 280 getDummyAlignment(1, 30, 13, 30), // #7 281 getDummyAlignment(1, 30, 20, 30), // #8 282 getDummyAlignment(1, 30, 20, 30), // #9 283 getDummyAlignment(1, 30, 20, 30), // #10 284 getDummyAlignment(1, 30, 24, 30), // #11 285 getDummyAlignment(2, 15, 0, 3), // #12 286 getDummyAlignment(2, 15, 0, 3), // #13 287 getDummyAlignment(2, 15, 0, 5), // #14 288 getDummyAlignment(2, 15, 0, 5), // #15 289 getDummyAlignment(2, 15, 0, 15), // #16 290 getDummyAlignment(2, 15, 0, 15), // #17 291 getDummyAlignment(2, 15, 0, 15), // #18 292 getDummyAlignment(2, 15, 5, 15), // #19 293 getDummyAlignment(2, 15, 5, 15), // #20 294 getDummyAlignment(2, 15, 5, 15), // #21 295 getDummyAlignment(2, 15, 9, 15), // #22 296 getDummyAlignment(3, 15, 1, 4), // #23 297 getDummyAlignment(3, 15, 2, 5), // #24 298 getDummyAlignment(3, 15, 3, 6), // #25 299 getDummyAlignment(3, 15, 4, 7), // #26 300 getDummyAlignment(3, 15, 5, 8), // #27 301 getDummyAlignment(3, 15, 6, 9), // #28 302 getDummyAlignment(3, 15, 7, 10), // #29 303 getDummyAlignment(3, 15, 8, 11), // #30 304 getDummyAlignment(3, 15, 9, 12), // #31 305 getDummyAlignment(3, 15, 10, 13), // #32 306 getDummyAlignment(3, 15, 11, 14), // #33 307 ]; 308 } 309 } 310 } 311 312 /// Apply the assessor to the given set of alignment. 313 override ReferenceRegion opCall(const(AlignmentChain[]) alignments) 314 { 315 if (alignments.length == 0) 316 { 317 return ReferenceRegion(); 318 } 319 320 static enum OK = CoverageZone.ok; 321 auto maskAcc = appender!(ReferenceInterval[]); 322 auto masker = Masker(); 323 auto changeEvents = coverageChanges(alignments); 324 auto lastEvent = changeEvents.front; 325 326 debug logJsonDebug("changeEvents", changeEvents.array.toJson); 327 foreach (event; changeEvents) 328 { 329 auto currentZone = coverageZone(event.currentCoverage); 330 auto newZone = coverageZone(event.newCoverage); 331 332 if (masker.isMasking && event.contigId != lastEvent.contigId) 333 { 334 maskAcc ~= masker.finish(lastEvent.position); 335 } 336 337 if ( 338 !masker.isMasking && 339 (newZone != OK || (currentZone == OK && currentZone != newZone)) 340 ) 341 { 342 masker.start(event.contigId, event.position); 343 } 344 else if (masker.isMasking && currentZone != OK && newZone == OK) 345 { 346 maskAcc ~= masker.finish(event.position); 347 } 348 349 lastEvent = event; 350 } 351 352 if (masker.isMasking) 353 { 354 maskAcc ~= masker.finish(lastEvent.position); 355 } 356 357 return ReferenceRegion(maskAcc.data); 358 } 359 360 /// 361 unittest 362 { 363 auto alignments = getTestAlignments(); 364 alias CoverageChange = CoverageChangeRange.CoverageChange; 365 366 auto assessor = new BadAlignmentCoverageAssessor(3, 5); 367 368 assert(assessor(alignments) == ReferenceRegion([ 369 ReferenceInterval(1, 0, 5), 370 ReferenceInterval(1, 10, 18), 371 ReferenceInterval(1, 20, 30), 372 ReferenceInterval(2, 0, 3), 373 ReferenceInterval(2, 5, 15), 374 ReferenceInterval(3, 0, 3), 375 ReferenceInterval(3, 12, 15), 376 ])); 377 } 378 379 private CoverageZone coverageZone(in int coverage) pure nothrow 380 { 381 return coverage < lowerLimit 382 ? CoverageZone.low 383 : coverage > upperLimit 384 ? CoverageZone.high 385 : CoverageZone.ok; 386 } 387 } 388 389 private enum CoverageZone 390 { 391 low, 392 ok, 393 high, 394 } 395 396 private struct Masker 397 { 398 private bool _isMasking = false; 399 private size_t contigId; 400 private size_t maskStart; 401 402 void start(in size_t contigId, in size_t maskStart) pure nothrow 403 { 404 this._isMasking = true; 405 this.contigId = contigId; 406 this.maskStart = maskStart; 407 } 408 409 ReferenceInterval finish(in size_t maskEnd) pure nothrow 410 { 411 this._isMasking = false; 412 413 return ReferenceInterval( 414 this.contigId, 415 this.maskStart, 416 maskEnd, 417 ); 418 } 419 420 @property bool isMasking() const pure nothrow 421 { 422 return this._isMasking; 423 } 424 } 425 426 /// Transforms a range of alignment chains into a range of coverage 427 /// change events. 428 private struct CoverageChangeRange 429 { 430 static alias AlignmentEvent = Tuple! 431 ( 432 size_t, "contigId", 433 size_t, "position", 434 int, "diff", 435 ); 436 437 static struct CoverageChange 438 { 439 size_t contigId; 440 size_t position; 441 int currentCoverage; 442 int newCoverage; 443 444 bool hasChanged() const pure nothrow 445 { 446 return currentCoverage != newCoverage; 447 } 448 } 449 450 AlignmentEvent[] alignmentEvents; 451 size_t currentEventIdx = 0; 452 CoverageChange _front; 453 454 private this(AlignmentEvent[] alignmentEvents) 455 { 456 this.alignmentEvents = alignmentEvents; 457 if (!empty) 458 { 459 // Advance to first event. 460 popFront(); 461 } 462 } 463 464 private static CoverageChangeRange create(Range)(Range alignments) 465 if (isInputRange!Range && is(ElementType!Range : const(AlignmentChain))) 466 { 467 if (alignments.length == 0) 468 { 469 return CoverageChangeRange(); 470 } 471 472 auto alignmentEvents = alignments 473 .map!(alignment => only( 474 AlignmentEvent(alignment.contigA.id, alignment.first.contigA.begin, 1), 475 AlignmentEvent(alignment.contigA.id, alignment.last.contigA.end, -1), 476 )) 477 .joiner; 478 auto contigBoundaryEvents = alignments 479 .map!"a.contigA" 480 .uniq 481 .map!(contig => only( 482 AlignmentEvent(contig.id, 0, 0), 483 AlignmentEvent(contig.id, contig.length, 0), 484 )) 485 .joiner; 486 auto changeEvents = chain(alignmentEvents, contigBoundaryEvents).array; 487 changeEvents.sort(); 488 489 return CoverageChangeRange(changeEvents); 490 } 491 492 void popFront() 493 { 494 _front.currentCoverage = _front.newCoverage; 495 496 if (currentEventIdx == alignmentEvents.length) 497 { 498 // There is no more data; just pop the front and return; 499 assert(_front.currentCoverage == 0, "coverage should drop to zero in the end"); 500 ++currentEventIdx; 501 502 return; 503 } 504 505 auto currentEvent = alignmentEvents[currentEventIdx]; 506 auto eventAcc = currentEvent; 507 eventAcc.diff = 0; 508 509 // Collect all events for one position. 510 for (; currentEventIdx < alignmentEvents.length; ++currentEventIdx) 511 { 512 currentEvent = alignmentEvents[currentEventIdx]; 513 514 if ( 515 eventAcc.contigId != currentEvent.contigId || 516 eventAcc.position != currentEvent.position 517 ) 518 { 519 break; 520 } 521 522 eventAcc.diff += currentEvent.diff; 523 } 524 525 assert(eventAcc.contigId == _front.contigId || _front.currentCoverage == 0, 526 "coverage should drop to zero between contigs"); 527 _front.contigId = eventAcc.contigId; 528 _front.position = eventAcc.position; 529 _front.newCoverage = _front.currentCoverage + eventAcc.diff; 530 } 531 532 @property bool empty() 533 { 534 return currentEventIdx > alignmentEvents.length; 535 } 536 537 @property CoverageChange front() 538 { 539 return _front; 540 } 541 } 542 543 CoverageChangeRange coverageChanges(Range)(Range alignments) 544 { 545 return CoverageChangeRange.create(alignments); 546 } 547 548 unittest 549 { 550 import std.algorithm : equal; 551 552 auto alignments = BadAlignmentCoverageAssessor.getTestAlignments(); 553 554 alias CoverageChange = CoverageChangeRange.CoverageChange; 555 556 assert(coverageChanges(alignments).equal([ 557 CoverageChange(1, 0, 0, 0), 558 CoverageChange(1, 5, 0, 3), 559 CoverageChange(1, 10, 3, 6), 560 CoverageChange(1, 13, 6, 7), 561 CoverageChange(1, 18, 7, 5), 562 CoverageChange(1, 20, 5, 6), 563 CoverageChange(1, 24, 6, 7), 564 CoverageChange(1, 30, 7, 0), 565 CoverageChange(2, 0, 0, 7), 566 CoverageChange(2, 3, 7, 5), 567 CoverageChange(2, 5, 5, 6), 568 CoverageChange(2, 9, 6, 7), 569 CoverageChange(2, 15, 7, 0), 570 CoverageChange(3, 0, 0, 0), 571 CoverageChange(3, 1, 0, 1), 572 CoverageChange(3, 2, 1, 2), 573 CoverageChange(3, 3, 2, 3), 574 CoverageChange(3, 4, 3, 3), 575 CoverageChange(3, 5, 3, 3), 576 CoverageChange(3, 6, 3, 3), 577 CoverageChange(3, 7, 3, 3), 578 CoverageChange(3, 8, 3, 3), 579 CoverageChange(3, 9, 3, 3), 580 CoverageChange(3, 10, 3, 3), 581 CoverageChange(3, 11, 3, 3), 582 CoverageChange(3, 12, 3, 2), 583 CoverageChange(3, 13, 2, 1), 584 CoverageChange(3, 14, 1, 0), 585 CoverageChange(3, 15, 0, 0), 586 ])); 587 }