dentist.commands.collectPileUps.filter source code

1 /**
2     This is a collection of helpers for the alignment filtering.
3 
4     Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de>
5     License: Subject to the terms of the MIT license, as written in the
6              included LICENSE file.
7     Authors: Arne Ludwig <arne.ludwig@posteo.de>
8 */
9 module dentist.commands.collectPileUps.filter;
10 
11 import dentist.common : ReferenceRegion, to;
12 import dentist.common.alignments :
13     AlignmentChain,
14     haveEqualIds;
15 import dentist.util.math : NaturalNumberSet;
16 import std.algorithm :
17     all,
18     chunkBy,
19     filter,
20     joiner,
21     map,
22     sort,
23     uniq;
24 import std.array : array;
25 import std.range :
26     InputRange,
27     inputRangeObject,
28     walkLength;
29 
30 interface AlignmentChainFilter
31 {
32     AlignmentChain[] opCall(AlignmentChain[] alignmentChains);
33 }
34 
35 abstract class ReadFilter : AlignmentChainFilter
36 {
37     NaturalNumberSet* unusedReads;
38 
39     this(NaturalNumberSet* unusedReads)
40     {
41         this.unusedReads = unusedReads;
42     }
43 
44     override AlignmentChain[] opCall(AlignmentChain[] alignmentChains)
45     {
46         NaturalNumberSet discardedReadIds;
47         discardedReadIds.reserveFor(unusedReads.capacity);
48 
49         foreach (discardedAlignment; getDiscardedReadIds(alignmentChains))
50         {
51             auto discardedReadId = discardedAlignment.contigB.id;
52 
53             discardedReadIds.add(discardedReadId);
54             unusedReads.remove(discardedReadId);
55         }
56 
57         foreach (ref alignmentChain; alignmentChains)
58         {
59             auto wasReadDiscarded = discardedReadIds.has(alignmentChain.contigB.id);
60             alignmentChain.disableIf(wasReadDiscarded);
61         }
62 
63         return alignmentChains;
64     }
65 
66     InputRange!(AlignmentChain) getDiscardedReadIds(AlignmentChain[] alignmentChains);
67 }
68 
69 /// Discard improper alignments.
70 class ImproperAlignmentChainsFilter : AlignmentChainFilter
71 {
72     override AlignmentChain[] opCall(AlignmentChain[] alignmentChains)
73     {
74         foreach (ref alignmentChain; alignmentChains)
75         {
76             alignmentChain.disableIf(!alignmentChain.isProper);
77         }
78 
79         return alignmentChains;
80     }
81 }
82 
83 /// Discard read if it has an alignment that – extended with the
84 /// exceeding read sequence on either end – is fully contained in
85 /// a single contig.
86 class RedundantAlignmentChainsFilter : ReadFilter
87 {
88     this(NaturalNumberSet* unusedReads)
89     {
90         super(unusedReads);
91     }
92 
93     override InputRange!(AlignmentChain) getDiscardedReadIds(AlignmentChain[] alignmentChains)
94     {
95         assert(alignmentChains.map!"a.isProper || a.flags.disabled".all);
96 
97         return inputRangeObject(alignmentChains.filter!"!a.flags.disabled && a.isFullyContained");
98     }
99 }
100 
101 /// Discard read if it has an alignment that aligns in multiple
102 /// locations of one contig with approximately equal quality.
103 class AmbiguousAlignmentChainsFilter : ReadFilter
104 {
105     /**
106         Two scores are considered similar if the relative "error" of them is
107         smaller than defaulMaxRelativeDiff.
108 
109         The relative error is defined as:
110 
111             relError(a, b) = abs(a - b)/max(a - b)
112 
113         **Implementation note:** all computations are done in integer
114         arithmetic hence AlignmentChain.maxScore corresponds to 1 in the above
115         equation.
116     */
117     static enum maxRelativeDiff = AlignmentChain.maxScore / 20; // 5% of larger value
118     /**
119         Two scores are considered similar if the absolute "error" of them is
120         smaller than defaulMaxAbsoluteDiff.
121 
122         The relative error is defined as:
123 
124             absError(a, b) = abs(a - b)
125 
126         **Implementation note:** all computations are done in integer
127         arithmetic hence all scores are lower than or equal to
128         AlignmentChain.maxScore .
129     */
130     static enum maxAbsoluteDiff = AlignmentChain.maxScore / 100; // 1% wrt. score
131 
132     this(NaturalNumberSet* unusedReads)
133     {
134         super(unusedReads);
135     }
136 
137     override InputRange!(AlignmentChain) getDiscardedReadIds(AlignmentChain[] alignmentChains)
138     {
139         assert(alignmentChains.map!"a.isProper || a.flags.disabled".all);
140 
141         return alignmentChains
142             .filter!"!a.flags.disabled"
143             .chunkBy!haveEqualIds
144             .filter!isAmgiguouslyAlignedRead
145             .joiner
146             .inputRangeObject;
147     }
148 
149     static bool isAmgiguouslyAlignedRead(Chunk)(Chunk readAlignments)
150     {
151         return readAlignments.save.walkLength(2) > 1 && haveSimilarScore(readAlignments);
152     }
153 
154     static bool haveSimilarScore(Chunk)(Chunk readAlignments)
155     {
156         auto scores = readAlignments.map!"a.score".array.sort.release;
157 
158         return similarScore(scores[0], scores[1]);
159     }
160 
161     protected static bool similarScore(size_t a, size_t b) pure
162     {
163         auto diff = a > b ? a - b : b - a;
164         auto magnitude = a > b ? a : b;
165 
166         return diff < maxAbsoluteDiff || (diff * AlignmentChain.maxScore / magnitude) < maxRelativeDiff;
167     }
168 
169     unittest
170     {
171         enum eps = 1;
172         enum refValueSmall = 2 * maxAbsoluteDiff;
173         enum refValueLarge = AlignmentChain.maxScore / 2;
174 
175         // test absolute part
176         assert(similarScore(refValueSmall, refValueSmall));
177         assert(similarScore(refValueSmall, refValueSmall + (maxAbsoluteDiff - 1)));
178         assert(similarScore(refValueSmall, refValueSmall - (maxAbsoluteDiff - 1)));
179         assert(!similarScore(refValueSmall, refValueSmall + (maxAbsoluteDiff + 1)));
180         assert(!similarScore(refValueSmall, refValueSmall - (maxAbsoluteDiff + 1)));
181         // test relative part
182         assert(similarScore(refValueLarge, refValueLarge));
183         assert(similarScore(refValueLarge,
184                 refValueLarge + refValueLarge * maxRelativeDiff / AlignmentChain.maxScore));
185         assert(similarScore(refValueLarge,
186                 refValueLarge - refValueLarge * maxRelativeDiff / AlignmentChain.maxScore) + eps);
187         assert(!similarScore(refValueLarge,
188                 refValueLarge + refValueLarge * 2 * maxRelativeDiff / AlignmentChain.maxScore));
189         assert(!similarScore(refValueLarge,
190                 refValueLarge - refValueLarge * 2 * maxRelativeDiff / AlignmentChain.maxScore));
191     }
192 }
193 
194 class WeaklyAnchoredAlignmentChainsFilter : AlignmentChainFilter
195 {
196     size_t minAnchorLength;
197     const(ReferenceRegion) repetitiveRegions;
198 
199     this(const(ReferenceRegion) repetitiveRegions, size_t minAnchorLength)
200     {
201         this.repetitiveRegions = repetitiveRegions;
202         this.minAnchorLength = minAnchorLength;
203     }
204 
205     AlignmentChain[] opCall(AlignmentChain[] alignmentChains)
206     {
207         foreach (ref alignmentChain; alignmentChains)
208         {
209             alignmentChain.disableIf(isWeaklyAnchored(alignmentChain));
210         }
211 
212         return alignmentChains;
213     }
214 
215     bool isWeaklyAnchored(AlignmentChain alignment)
216     {
217         // Mark reads as weakly anchored if they are mostly anchored in a
218         // repetitive region
219         auto uniqueAlignmentRegion = alignment.to!(ReferenceRegion, "contigA") - repetitiveRegions;
220         bool isWeaklyAnchored = uniqueAlignmentRegion.size <= minAnchorLength;
221 
222         return isWeaklyAnchored;
223     }
224 }