1 /** 2 This is a collection of helpers for the alignment filtering. 3 4 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de> 5 License: Subject to the terms of the MIT license, as written in the 6 included LICENSE file. 7 Authors: Arne Ludwig <arne.ludwig@posteo.de> 8 */ 9 module dentist.commands.collectPileUps.filter; 10 11 import dentist.common : ReferenceRegion, to; 12 import dentist.common.alignments : 13 AlignmentChain, 14 haveEqualIds; 15 import dentist.util.math : NaturalNumberSet; 16 import std.algorithm : 17 all, 18 chunkBy, 19 filter, 20 joiner, 21 map, 22 sort, 23 uniq; 24 import std.array : array; 25 import std.range : 26 InputRange, 27 inputRangeObject, 28 walkLength; 29 30 interface AlignmentChainFilter 31 { 32 AlignmentChain[] opCall(AlignmentChain[] alignmentChains); 33 } 34 35 abstract class ReadFilter : AlignmentChainFilter 36 { 37 NaturalNumberSet* unusedReads; 38 39 this(NaturalNumberSet* unusedReads) 40 { 41 this.unusedReads = unusedReads; 42 } 43 44 override AlignmentChain[] opCall(AlignmentChain[] alignmentChains) 45 { 46 NaturalNumberSet discardedReadIds; 47 discardedReadIds.reserveFor(unusedReads.capacity); 48 49 foreach (discardedAlignment; getDiscardedReadIds(alignmentChains)) 50 { 51 auto discardedReadId = discardedAlignment.contigB.id; 52 53 discardedReadIds.add(discardedReadId); 54 unusedReads.remove(discardedReadId); 55 } 56 57 foreach (ref alignmentChain; alignmentChains) 58 { 59 auto wasReadDiscarded = discardedReadIds.has(alignmentChain.contigB.id); 60 alignmentChain.disableIf(wasReadDiscarded); 61 } 62 63 return alignmentChains; 64 } 65 66 InputRange!(AlignmentChain) getDiscardedReadIds(AlignmentChain[] alignmentChains); 67 } 68 69 /// Discard improper alignments. 70 class ImproperAlignmentChainsFilter : AlignmentChainFilter 71 { 72 override AlignmentChain[] opCall(AlignmentChain[] alignmentChains) 73 { 74 foreach (ref alignmentChain; alignmentChains) 75 { 76 alignmentChain.disableIf(!alignmentChain.isProper); 77 } 78 79 return alignmentChains; 80 } 81 } 82 83 /// Discard read if it has an alignment that – extended with the 84 /// exceeding read sequence on either end – is fully contained in 85 /// a single contig. 86 class RedundantAlignmentChainsFilter : ReadFilter 87 { 88 this(NaturalNumberSet* unusedReads) 89 { 90 super(unusedReads); 91 } 92 93 override InputRange!(AlignmentChain) getDiscardedReadIds(AlignmentChain[] alignmentChains) 94 { 95 assert(alignmentChains.map!"a.isProper || a.flags.disabled".all); 96 97 return inputRangeObject(alignmentChains.filter!"!a.flags.disabled && a.isFullyContained"); 98 } 99 } 100 101 /// Discard read if it has an alignment that aligns in multiple 102 /// locations of one contig with approximately equal quality. 103 class AmbiguousAlignmentChainsFilter : ReadFilter 104 { 105 /** 106 Two scores are considered similar if the relative "error" of them is 107 smaller than defaulMaxRelativeDiff. 108 109 The relative error is defined as: 110 111 relError(a, b) = abs(a - b)/max(a - b) 112 113 **Implementation note:** all computations are done in integer 114 arithmetic hence AlignmentChain.maxScore corresponds to 1 in the above 115 equation. 116 */ 117 static enum maxRelativeDiff = AlignmentChain.maxScore / 20; // 5% of larger value 118 /** 119 Two scores are considered similar if the absolute "error" of them is 120 smaller than defaulMaxAbsoluteDiff. 121 122 The relative error is defined as: 123 124 absError(a, b) = abs(a - b) 125 126 **Implementation note:** all computations are done in integer 127 arithmetic hence all scores are lower than or equal to 128 AlignmentChain.maxScore . 129 */ 130 static enum maxAbsoluteDiff = AlignmentChain.maxScore / 100; // 1% wrt. score 131 132 this(NaturalNumberSet* unusedReads) 133 { 134 super(unusedReads); 135 } 136 137 override InputRange!(AlignmentChain) getDiscardedReadIds(AlignmentChain[] alignmentChains) 138 { 139 assert(alignmentChains.map!"a.isProper || a.flags.disabled".all); 140 141 return alignmentChains 142 .filter!"!a.flags.disabled" 143 .chunkBy!haveEqualIds 144 .filter!isAmgiguouslyAlignedRead 145 .joiner 146 .inputRangeObject; 147 } 148 149 static bool isAmgiguouslyAlignedRead(Chunk)(Chunk readAlignments) 150 { 151 return readAlignments.save.walkLength(2) > 1 && haveSimilarScore(readAlignments); 152 } 153 154 static bool haveSimilarScore(Chunk)(Chunk readAlignments) 155 { 156 auto scores = readAlignments.map!"a.score".array.sort.release; 157 158 return similarScore(scores[0], scores[1]); 159 } 160 161 protected static bool similarScore(size_t a, size_t b) pure 162 { 163 auto diff = a > b ? a - b : b - a; 164 auto magnitude = a > b ? a : b; 165 166 return diff < maxAbsoluteDiff || (diff * AlignmentChain.maxScore / magnitude) < maxRelativeDiff; 167 } 168 169 unittest 170 { 171 enum eps = 1; 172 enum refValueSmall = 2 * maxAbsoluteDiff; 173 enum refValueLarge = AlignmentChain.maxScore / 2; 174 175 // test absolute part 176 assert(similarScore(refValueSmall, refValueSmall)); 177 assert(similarScore(refValueSmall, refValueSmall + (maxAbsoluteDiff - 1))); 178 assert(similarScore(refValueSmall, refValueSmall - (maxAbsoluteDiff - 1))); 179 assert(!similarScore(refValueSmall, refValueSmall + (maxAbsoluteDiff + 1))); 180 assert(!similarScore(refValueSmall, refValueSmall - (maxAbsoluteDiff + 1))); 181 // test relative part 182 assert(similarScore(refValueLarge, refValueLarge)); 183 assert(similarScore(refValueLarge, 184 refValueLarge + refValueLarge * maxRelativeDiff / AlignmentChain.maxScore)); 185 assert(similarScore(refValueLarge, 186 refValueLarge - refValueLarge * maxRelativeDiff / AlignmentChain.maxScore) + eps); 187 assert(!similarScore(refValueLarge, 188 refValueLarge + refValueLarge * 2 * maxRelativeDiff / AlignmentChain.maxScore)); 189 assert(!similarScore(refValueLarge, 190 refValueLarge - refValueLarge * 2 * maxRelativeDiff / AlignmentChain.maxScore)); 191 } 192 } 193 194 class WeaklyAnchoredAlignmentChainsFilter : AlignmentChainFilter 195 { 196 size_t minAnchorLength; 197 const(ReferenceRegion) repetitiveRegions; 198 199 this(const(ReferenceRegion) repetitiveRegions, size_t minAnchorLength) 200 { 201 this.repetitiveRegions = repetitiveRegions; 202 this.minAnchorLength = minAnchorLength; 203 } 204 205 AlignmentChain[] opCall(AlignmentChain[] alignmentChains) 206 { 207 foreach (ref alignmentChain; alignmentChains) 208 { 209 alignmentChain.disableIf(isWeaklyAnchored(alignmentChain)); 210 } 211 212 return alignmentChains; 213 } 214 215 bool isWeaklyAnchored(AlignmentChain alignment) 216 { 217 // Mark reads as weakly anchored if they are mostly anchored in a 218 // repetitive region 219 auto uniqueAlignmentRegion = alignment.to!(ReferenceRegion, "contigA") - repetitiveRegions; 220 bool isWeaklyAnchored = uniqueAlignmentRegion.size <= minAnchorLength; 221 222 return isWeaklyAnchored; 223 } 224 }