1 /** 2 This package contains methods for cropping a pile up. 3 4 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dentist.commands.processPileUps.cropper; 10 11 import dentist.common : 12 ReadInterval, 13 ReferenceRegion, 14 ReferencePoint, 15 to; 16 import dentist.common.alignments : 17 AlignmentChain, 18 AlignmentLocationSeed, 19 coord_t, 20 getType, 21 PileUp, 22 ReadAlignment, 23 SeededAlignment, 24 trace_point_t; 25 import dentist.dazzler : buildDamFile, getFastaSequences; 26 import dentist.util.algorithm : sliceBy; 27 import dentist.util.log; 28 import dentist.util.math : ceil, ceildiv, RoundingMode; 29 import dentist.util.region : min, sup; 30 import std.algorithm : 31 all, 32 canFind, 33 copy, 34 joiner, 35 filter, 36 fold, 37 map, 38 sort, 39 sum, 40 swap; 41 import std.array : array, minimallyInitializedArray; 42 import std.conv : to; 43 import std.format : format; 44 import std.range : 45 chain, 46 chunks, 47 iota, 48 retro, 49 zip; 50 import std.typecons : tuple; 51 import vibe.data.json : toJson = serializeToJson; 52 53 54 struct CropOptions 55 { 56 string readsDb; 57 trace_point_t tracePointDistance; 58 string workdir; 59 } 60 61 auto cropPileUp(PileUp pileUp, in ReferenceRegion mask, in CropOptions options) 62 { 63 auto cropper = PileUpCropper(pileUp, mask, options); 64 cropper.buildDb(); 65 66 return cropper.result; 67 } 68 69 private struct PileUpCropper 70 { 71 PileUp pileUp; 72 const(ReferenceRegion) repeatMask; 73 const(CropOptions) options; 74 private ReferencePoint[] croppingRefPositions; 75 private string croppedDb; 76 77 @property auto result() 78 { 79 return tuple!("db", "referencePositions")(croppedDb, croppingRefPositions); 80 } 81 82 void buildDb() 83 { 84 fetchCroppingRefPositions(); 85 auto croppedSequences = pileUpWithSequence().map!(t => getCroppedSequence(t.expand)); 86 croppedDb = buildDamFile(croppedSequences, options.workdir); 87 } 88 89 private void fetchCroppingRefPositions() 90 { 91 croppingRefPositions = getCroppingRefPositions(); 92 logJsonDebug("croppingRefPositions", croppingRefPositions.toJson); 93 94 foreach (refPos; croppingRefPositions) 95 { 96 if (refPos.value == -1) 97 { 98 if (shouldLog(LogLevel.diagnostic)) 99 { 100 auto involvedContigs = croppingRefPositions.map!"a.contigId".array; 101 auto localRepeatMask = repeatMask 102 .intervals 103 .filter!(i => involvedContigs.canFind(i.contigId)) 104 .array; 105 logJsonDiagnostic( 106 "info", "could not find a common trace point", 107 "croppingRefPositions", croppingRefPositions.toJson, 108 "tracePointDistance", options.tracePointDistance, 109 "pileUp", [ 110 "type": pileUp.getType.to!string.toJson, 111 "readAlignments": shouldLog(LogLevel.debug_) 112 ? pileUp.map!"a[]".array.toJson 113 : toJson(null), 114 ].toJson, 115 "repeatMask", localRepeatMask.toJson, 116 ); 117 } 118 119 throw new Exception("could not find a common trace point"); 120 } 121 } 122 } 123 124 private auto pileUpWithSequence() 125 { 126 auto readIds = pileUp.map!"a[0].contigB.id + 0".array; 127 128 return zip( 129 iota(pileUp.length), 130 pileUp, 131 getFastaSequences(options.readsDb, readIds, options.workdir), 132 ); 133 } 134 135 /** 136 Get points on the reference where the pileUp should be cropped. Returns 137 one common trace point for each involved contig. 138 139 See_Also: `getCommonTracePoint` 140 */ 141 private ReferencePoint[] getCroppingRefPositions() 142 { 143 auto alignmentsByContig = pileUp.splitAlignmentsByContigA(); 144 auto commonTracePoints = alignmentsByContig 145 .map!(acs => getCommonTracePoint(acs, repeatMask, options.tracePointDistance)); 146 auto contigAIds = alignmentsByContig.map!"a[0].contigA.id"; 147 148 auto cropPos = zip(contigAIds, commonTracePoints) 149 .map!(zipped => ReferencePoint(zipped.expand)) 150 .array; 151 152 debug logJsonDebug( 153 "alignmentsByContig", alignmentsByContig.toJson, 154 "pileUp", [ 155 "type": pileUp.getType.to!string.toJson, 156 "readAlignments": pileUp.map!"a[]".array.toJson, 157 ], 158 "cropPos", cropPos.toJson, 159 ); 160 161 return cropPos; 162 } 163 164 private string getCroppedSequence( 165 in size_t croppedDbIdx, 166 in ReadAlignment readAlignment, 167 in string readSequence, 168 ) 169 { 170 auto readCroppingSlice = getReadCroppingSlice(readAlignment); 171 debug logJsonDebug( 172 "croppedDbIdx", croppedDbIdx, 173 "readCroppingSlice", readCroppingSlice.toJson, 174 ); 175 176 return getCroppedReadAsFasta( 177 readSequence, 178 readCroppingSlice, 179 ); 180 } 181 182 private auto getReadCroppingSlice(in ReadAlignment readAlignment) 183 { 184 auto readCroppingSlice = readAlignment[] 185 .map!(alignment => alignment.getCroppingSlice(croppingRefPositions)) 186 .fold!"a & b"; 187 assert(!readCroppingSlice.empty, "invalid/empty read cropping slice"); 188 189 return readCroppingSlice; 190 } 191 192 private string getCroppedReadAsFasta(string readSequence, in ReadInterval readCroppingSlice) 193 { 194 static enum fastaLineWidth = 100; 195 auto fastaHeader = format!">read-%d"(readCroppingSlice.readId); 196 auto fastaLength = 197 fastaHeader.length + 1 + // header line + line break 198 readCroppingSlice.size + // sequence + ... 199 ceildiv(readCroppingSlice.size, fastaLineWidth); // line breaks for sequence + 200 // new line at the end of file 201 auto croppedFasta = minimallyInitializedArray!(ubyte[])(fastaLength); 202 chain( 203 fastaHeader[], 204 "\n", 205 readSequence[readCroppingSlice.begin .. readCroppingSlice.end] 206 .chunks(fastaLineWidth) 207 .joiner("\n"), 208 "\n", 209 ).copy(croppedFasta); 210 211 return cast(string) croppedFasta; 212 } 213 } 214 215 /// Sort alignment chains of pileUp into groups with the same `contigA.id`. 216 private SeededAlignment[][] splitAlignmentsByContigA(PileUp pileUp) 217 { 218 if (pileUp.length == 0) 219 { 220 return []; 221 } 222 223 auto orderedAlignments = pileUp 224 .map!"a[]" 225 .joiner 226 .array 227 .sort 228 .release; 229 auto alignmentsByContig = orderedAlignments.sliceBy!"a.contigA.id == b.contigA.id"; 230 231 return array(alignmentsByContig); 232 } 233 234 /// Returns a common trace points wrt. contigA that is not in mask and is on 235 /// the relevant half of the contig. 236 private long getCommonTracePoint( 237 in SeededAlignment[] alignments, 238 in ReferenceRegion mask, 239 in size_t tracePointDistance 240 ) 241 { 242 static long getCommonTracePoint(R)(R tracePointCandidates, ReferenceRegion allowedTracePointRegion) pure 243 { 244 auto commonTracePoints = tracePointCandidates.filter!(c => c in allowedTracePointRegion); 245 246 return commonTracePoints.empty ? -1 : commonTracePoints.front.value.to!long; 247 } 248 249 auto contigA = alignments[0].contigA; 250 auto locationSeed = alignments[0].seed; 251 auto commonAlignmentRegion = alignments 252 .map!(to!(ReferenceRegion, "contigA")) 253 .fold!"a & b"; 254 auto allowedTracePointRegion = commonAlignmentRegion - mask; 255 256 assert(alignments.all!(a => a.contigA == contigA && a.seed == locationSeed)); 257 debug logJsonDebug( 258 "alignments", alignments.toJson, 259 "commonAlignmentRegion", commonAlignmentRegion.toJson, 260 "allowedTracePointRegion", allowedTracePointRegion.toJson, 261 ); 262 263 if (allowedTracePointRegion.empty) 264 { 265 return -1; 266 } 267 268 auto tracePointMin = ceil(min(allowedTracePointRegion), tracePointDistance); 269 auto tracePointSup = ceil(sup(allowedTracePointRegion), tracePointDistance); 270 auto tracePointCandidates = iota(tracePointMin, tracePointSup, tracePointDistance) 271 .map!(tracePointCandidate => ReferencePoint(contigA.id, tracePointCandidate)); 272 273 return locationSeed == AlignmentLocationSeed.front 274 ? getCommonTracePoint(tracePointCandidates.retro, allowedTracePointRegion) 275 : getCommonTracePoint(tracePointCandidates, allowedTracePointRegion); 276 } 277 278 private ReadInterval getCroppingSlice( 279 in SeededAlignment alignment, 280 in ReferencePoint[] croppingRefPoints, 281 ) 282 { 283 auto locationSeed = alignment.seed; 284 auto read = alignment.contigB; 285 auto tracePointDistance = alignment.tracePointDistance; 286 auto croppingRefPos = croppingRefPoints 287 .filter!(p => p.contigId == alignment.contigA.id) 288 .front 289 .value; 290 291 auto readCroppingPos = alignment 292 .translateTracePoint(cast(coord_t) croppingRefPos, RoundingMode.floor) 293 .contigB; 294 size_t readBeginIdx; 295 size_t readEndIdx; 296 297 final switch (locationSeed) 298 { 299 case AlignmentLocationSeed.front: 300 readBeginIdx = 0; 301 readEndIdx = readCroppingPos; 302 break; 303 case AlignmentLocationSeed.back: 304 readBeginIdx = readCroppingPos; 305 readEndIdx = read.length; 306 break; 307 } 308 309 if (alignment.complement) 310 { 311 swap(readBeginIdx, readEndIdx); 312 readBeginIdx = read.length - readBeginIdx; 313 readEndIdx = read.length - readEndIdx; 314 } 315 316 auto croppingSlice = ReadInterval(read.id, readBeginIdx, readEndIdx); 317 318 debug logJsonDebug( 319 "alignment", alignment.toJson, 320 "croppingRefPoints", croppingRefPoints.toJson, 321 "croppingSlice", croppingSlice.toJson, 322 ); 323 324 return croppingSlice; 325 } 326 327 unittest 328 { 329 alias Contig = AlignmentChain.Contig; 330 alias Flags = AlignmentChain.Flags; 331 alias emptyFlags = AlignmentChain.emptyFlags; 332 alias complement = AlignmentChain.Flag.complement; 333 alias LocalAlignment = AlignmentChain.LocalAlignment; 334 alias Locus = LocalAlignment.Locus; 335 alias TracePoint = LocalAlignment.TracePoint; 336 enum tracePointDistance = 100; 337 338 { 339 enum alignment = SeededAlignment( 340 AlignmentChain( 341 0, 342 Contig(1, 2584), 343 Contig(58024, 10570), 344 Flags(complement), 345 [LocalAlignment( 346 Locus(579, 2584), 347 Locus(0, 2158), 348 292, 349 [ 350 TracePoint( 2, 23), 351 TracePoint(11, 109), 352 TracePoint(13, 109), 353 TracePoint(15, 107), 354 TracePoint(18, 107), 355 TracePoint(14, 103), 356 TracePoint(16, 106), 357 TracePoint(17, 106), 358 TracePoint( 9, 106), 359 TracePoint(14, 112), 360 TracePoint(16, 105), 361 TracePoint(16, 114), 362 TracePoint(10, 103), 363 TracePoint(14, 110), 364 TracePoint(15, 110), 365 TracePoint(15, 101), 366 TracePoint(17, 108), 367 TracePoint(17, 109), 368 TracePoint(15, 111), 369 TracePoint(17, 111), 370 TracePoint(11, 88), 371 ], 372 )], 373 tracePointDistance, 374 ), 375 AlignmentLocationSeed.back, 376 ); 377 378 assert( 379 getCroppingSlice(alignment, [ReferencePoint(1, 600)]) == 380 ReadInterval(58024, 0, 10547) 381 ); 382 assert( 383 getCroppingSlice(alignment, [ReferencePoint(1, 800)]) == 384 ReadInterval(58024, 0, 10329) 385 ); 386 } 387 { 388 enum alignment = SeededAlignment( 389 AlignmentChain( 390 0, 391 Contig(1, 2584), 392 Contig(7194, 9366), 393 emptyFlags, 394 [LocalAlignment( 395 Locus(0, 724), 396 Locus(8612, 9366), 397 76, 398 [ 399 TracePoint( 7, 107), 400 TracePoint(17, 106), 401 TracePoint(12, 96), 402 TracePoint( 9, 107), 403 TracePoint( 7, 102), 404 TracePoint(11, 105), 405 TracePoint(11, 105), 406 TracePoint( 2, 26), 407 ], 408 )], 409 tracePointDistance, 410 ), 411 AlignmentLocationSeed.front, 412 ); 413 414 assert( 415 getCroppingSlice(alignment, [ReferencePoint(1, 0)]) == 416 ReadInterval(7194, 0, 8612) 417 ); 418 assert( 419 getCroppingSlice(alignment, [ReferencePoint(1, 100)]) == 420 ReadInterval(7194, 0, 8719) 421 ); 422 assert( 423 getCroppingSlice(alignment, [ReferencePoint(1, 200)]) == 424 ReadInterval(7194, 0, 8825) 425 ); 426 } 427 }