1 /**
2     This package contains methods to handle the proprietary binary data
3     container for `Insertion`s.
4 
5     Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de>
6     License: Subject to the terms of the MIT license, as written in the
7              included LICENSE file.
8     Authors: Arne Ludwig <arne.ludwig@posteo.de>
9 */
10 module dentist.common.binio.insertiondb;
11 
12 import core.exception : AssertError;
13 import dentist.common : ReferencePoint;
14 import dentist.common.alignments : AlignmentChain;
15 import dentist.common.binio._base :
16     ArrayStorage,
17     CompressedBaseQuad,
18     CompressedSequence,
19     DbIndex,
20     lockIfPossible,
21     readRecord,
22     readRecordAt,
23     readRecords;
24 import dentist.common.insertions :
25     Insertion,
26     InsertionInfo,
27     SpliceSite;
28 import dentist.common.scaffold :
29     ContigNode,
30     ContigPart;
31 import std.array : minimallyInitializedArray;
32 import std.conv : to;
33 import std.exception : assertThrown, enforce, ErrnoException;
34 import std.format : format;
35 import std.range :
36     ElementType,
37     empty,
38     front,
39     hasLength,
40     isForwardRange,
41     isInputRange,
42     popFront,
43     save;
44 import std.stdio : File;
45 import std.traits : isArray;
46 import std.typecons : tuple, Tuple;
47 
48 version (unittest) import dentist.common.binio._testdata :
49     getInsertionsTestData,
50     numCompressedBaseQuads,
51     numInsertions,
52     numSpliceSites;
53 
54 
/// Exception type used by `InsertionDb` to report failed DB accesses, e.g.
/// out-of-bounds indices.
class InsertionDbException : Exception
{
    /// Forwards all arguments unchanged to the `Exception` constructor.
    this(
        string msg,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable nextInChain = null,
    ) pure nothrow @nogc @safe
    {
        super(msg, file, line, nextInChain);
    }
}
63 
/**
    Reader (and static writer entry point) for the binary insertion DB.

    The file starts with an `InsertionDbIndex` record which is followed by
    three blocks: the insertion records, the compressed base quads and the
    splice sites. Individual insertions or ranges of insertions are read on
    demand using `db[i]`, `db[from .. to]` or `db[]`.
*/
struct InsertionDb
{
    /// File regions (one per block) that belong to the slice of insertions
    /// currently being read.
    private alias DbSlices = Tuple!(
        ArrayStorage!(StorageType!Insertion), "insertions",
        ArrayStorage!(StorageType!CompressedBaseQuad), "compressedBaseQuads",
        ArrayStorage!(StorageType!SpliceSite), "spliceSites",
    );

    private File file;
    private InsertionDbIndex index;
    private DbSlices slices;

    /// Storage descriptor of the insertions block as recorded in the index.
    /// Note: does not load the index; it is `init` until the DB was accessed.
    @property auto insertions() const pure nothrow
    {
        return index.insertions;
    }

    /// Storage descriptor of the compressed base quads block as recorded in
    /// the index.
    @property auto compressedBaseQuads() const pure nothrow
    {
        return index.compressedBaseQuads;
    }

    /// Storage descriptor of the splice sites block as recorded in the index.
    @property auto spliceSites() const pure nothrow
    {
        return index.spliceSites;
    }

    /**
        Open `dbFile` for binary reading and load its index.

        Params:
            dbFile = path of the insertion DB
        Returns: a DB handle with the index already read.
    */
    static InsertionDb parse(in string dbFile)
    {
        auto file = File(dbFile, "rb");
        lockIfPossible(file);
        auto db = InsertionDb(file);
        db.ensureDbIndex();

        return db;
    }

    /// Close the underlying file.
    void releaseDb()
    {
        file.close();
    }

    /// Read all insertions stored in the DB.
    Insertion[] opIndex()
    {
        ensureDbIndex();

        return readSlice(0, length);
    }

    /**
        Read the `i`-th insertion.

        Throws: `InsertionDbException` if `i` is not in `[0, length)`.
    */
    Insertion opIndex(size_t i)
    {
        ensureDbIndex();
        enforce!InsertionDbException(
            i < length,
            format!"cannot read block %d in `%s`: out of bounds [0, %d)"(
                    i, file.name, length)
        );

        return readSlice(i, i + 1)[0];
    }

    /**
        Read the insertions `slice[0] .. slice[1]`.

        Throws: `InsertionDbException` if the slice is reversed
                (`slice[0] > slice[1]`) or exceeds the DB.
    */
    Insertion[] opIndex(size_t[2] slice)
    {
        auto from = slice[0];
        auto to = slice[1];
        ensureDbIndex();
        // Must be checked here as well: this overload can be called directly
        // and the assertions in `opSlice`/`readSlice` vanish in release
        // builds, which would let a reversed slice reach the file reads.
        enforce!InsertionDbException(
            from <= to,
            format!"cannot read blocks %d-%d in `%s`: invalid slice"(
                    from, to, file.name)
        );
        enforce!InsertionDbException(
            to <= length,
            format!"cannot read blocks %d-%d in `%s`: out of bounds [0, %d]"(
                    from, to, file.name, length)
        );

        return readSlice(from, to);
    }

    /// Build the bounds for `db[from .. to]`. Empty slices (`from == to`)
    /// are valid and yield an empty array when read.
    size_t[2] opSlice(size_t dim)(size_t from, size_t to)
            if (dim == 0)
    {
        assert(from <= to, "invalid slice");

        return [from, to];
    }

    /// Number of insertions in the DB; loads the index if necessary.
    @property size_t length()
    {
        ensureDbIndex();

        return index.insertions.length;
    }

    alias opDollar = length;

    /// Read the index from the current file position unless it has been
    /// read before (detected by it differing from `init`).
    private void ensureDbIndex()
    {
        if (index != index.init)
            return;

        index = file.readRecord!InsertionDbIndex();
    }

    /// Read the insertions `from .. to` including their sequences and splice
    /// sites. Expects valid bounds.
    private Insertion[] readSlice(size_t from, size_t to)
    {
        assert(from <= to && to <= length);

        if (from == to)
            return [];

        // Step 1: determine memory requirements and DB slices
        slices = getSlices(from, to);

        // Step 2: allocate minimally initialized memory for all blocks
        auto insertions = minimallyInitializedArray!(Insertion[])(slices.insertions.length);
        auto compressedBaseQuads = minimallyInitializedArray!(CompressedBaseQuad[])(slices.compressedBaseQuads.length);
        auto spliceSites = minimallyInitializedArray!(SpliceSite[])(slices.spliceSites.length);

        // Step 3: parse each record for each block assigning already
        //         allocated array slices to the array fields
        parse(insertions, compressedBaseQuads, spliceSites);
        parse(compressedBaseQuads);
        parse(spliceSites);

        return insertions;
    }

    /// Determine the file regions of all three blocks that belong to the
    /// insertions `from .. to`. Requires a non-empty range because the first
    /// and last record are read from the file.
    private DbSlices getSlices(size_t from, size_t to)
    {
        auto insertions = index.insertions[from .. to];
        auto firstInsertion = file.readRecordAt!(StorageType!Insertion)(insertions[0]);
        auto lastInsertion = file.readRecordAt!(StorageType!Insertion)(insertions[$ - 1]);

        // The payload of consecutive insertions is stored contiguously (see
        // `InsertionDbFileWriter`), hence the region spans from the begin of
        // the first payload to the end of the last one.
        auto compressedBaseQuads = ArrayStorage!(StorageType!CompressedBaseQuad).fromPtrs(
            firstInsertion.sequence[0],
            lastInsertion.sequence[$],
        );

        auto spliceSites = ArrayStorage!(StorageType!SpliceSite).fromPtrs(
            firstInsertion.spliceSites[0],
            lastInsertion.spliceSites[$],
        );

        return DbSlices(
            insertions,
            compressedBaseQuads,
            spliceSites,
        );
    }

    /// Read the insertion records of the current `slices` and build the
    /// `Insertion`s such that their array fields are consecutive sub-slices
    /// of `compressedBaseQuads` and `spliceSites`, respectively.
    private void parse(
        ref Insertion[] insertions,
        CompressedBaseQuad[] compressedBaseQuads,
        SpliceSite[] spliceSites,
    )
    {
        file.seek(slices.insertions.ptr);

        // Running [begin, end) windows into the pre-allocated payload arrays.
        size_t[2] compressedBaseQuadsSlice;
        size_t[2] spliceSitesSlice;

        foreach (ref insertion; insertions)
        {
            auto insertionStorage = file.readRecord!(StorageType!Insertion);

            compressedBaseQuadsSlice[0] = compressedBaseQuadsSlice[1];
            compressedBaseQuadsSlice[1] += insertionStorage.sequence.length;

            spliceSitesSlice[0] = spliceSitesSlice[1];
            spliceSitesSlice[1] += insertionStorage.spliceSites.length;

            insertion = Insertion(
                insertionStorage.start,
                insertionStorage.end,
                InsertionInfo(
                    CompressedSequence(
                        compressedBaseQuads[
                            compressedBaseQuadsSlice[0] .. compressedBaseQuadsSlice[1]
                        ],
                        insertionStorage.baseOffset,
                        insertionStorage.sequenceLength,
                    ),
                    insertionStorage.contigLength,
                    spliceSites[spliceSitesSlice[0] .. spliceSitesSlice[1]],
                ),
            );
        }
    }

    /// Read the splice sites of the current `slices` into `spliceSites`.
    // NOTE(review): presumably `readRecords` fills the passed buffer in
    // place -- the `Insertion`s built before alias this memory. Confirm
    // against `dentist.common.binio._base`.
    private void parse(
        ref SpliceSite[] spliceSites,
    )
    {
        static assert(SpliceSite.sizeof == StorageType!SpliceSite.sizeof);
        file.seek(slices.spliceSites.ptr);
        spliceSites = file.readRecords(spliceSites);
    }

    /// Read the compressed base quads of the current `slices` into
    /// `compressedBaseQuads` (same caveat as for the splice sites).
    private void parse(
        ref CompressedBaseQuad[] compressedBaseQuads,
    )
    {
        static assert(CompressedBaseQuad.sizeof == StorageType!CompressedBaseQuad.sizeof);
        file.seek(slices.compressedBaseQuads.ptr);
        compressedBaseQuads = file.readRecords(compressedBaseQuads);
    }

    /**
        Write `insertions` to `dbFile`; the file is opened with mode "wb".

        Params:
            dbFile     = path of the DB file to write
            insertions = forward range of insertions with known length
    */
    static void write(R)(in string dbFile, R insertions)
            if (isForwardRange!R && hasLength!R && is(ElementType!R : const(Insertion)))
    {
        auto writer = InsertionDbFileWriter!R(File(dbFile, "wb"), insertions);

        lockIfPossible(writer.file);
        writer.writeToFile();
    }
}
277 
// Round trip: write the test data to a temporary DB file, check the file
// size and read everything back.
unittest
{
    import dentist.util.tempfile : mkstemp;
    import std.file : remove;

    auto expectedInsertions = getInsertionsTestData();

    // The file consists of the index plus one fixed-size record per element
    // of each of the three blocks.
    enum expectedFileSize =
        InsertionDbIndex.sizeof +
        StorageType!Insertion.sizeof * numInsertions +
        StorageType!CompressedBaseQuad.sizeof * numCompressedBaseQuads +
        StorageType!SpliceSite.sizeof * numSpliceSites;

    auto tempDb = mkstemp("./.unittest-XXXXXX");
    scope (exit)
    {
        tempDb.file.close();
        remove(tempDb.name);
    }

    InsertionDbFileWriter!(Insertion[])(
        tempDb.file,
        expectedInsertions,
    ).writeToFile();
    tempDb.file.sync();

    assert(tempDb.file.size == expectedFileSize);

    // The index is read from the current position, so start over.
    tempDb.file.rewind();
    auto db = InsertionDb(tempDb.file);

    assert(db[] == expectedInsertions);
}
308 
/**
    Writes a range of `Insertion`s to `file`: first the index, then the
    insertion records, then all compressed base quads and finally all splice
    sites. With assertions enabled, the file position is checked against the
    index after every write.
*/
private struct InsertionDbFileWriter(R)
        if (isForwardRange!R && hasLength!R && is(ElementType!R : const(Insertion)))
{
    File file;
    R insertions;
    InsertionDbIndex index;

    /// Compute the index for `insertions` and write the index followed by
    /// the three blocks.
    void writeToFile()
    {
        index = InsertionDbIndex.from(insertions.save);

        file.rawWrite([index]);
        writeBlock!Insertion();
        writeBlock!CompressedBaseQuad();
        writeBlock!SpliceSite();
    }

    /// Write one `InsertionStorage` record per insertion. The records refer
    /// to their payload by storage descriptors that advance through the
    /// compressed base quads and splice sites blocks.
    void writeBlock(T : Insertion)()
    {
        // Both descriptors start as empty regions at the begin of their block.
        auto sequenceStorage = index.compressedBaseQuads;
        sequenceStorage.length = 0;
        auto spliceSitesStorage = index.spliceSites;
        spliceSitesStorage.length = 0;

        version (assert)
        {
            auto writtenInsertions = index.insertions;
            writtenInsertions.length = 0;
            assert(writtenInsertions.ptr == file.tell());
        }
        foreach (insertion; this.insertions.save)
        {
            sequenceStorage.length = insertion.payload.sequence.compressedLength;
            spliceSitesStorage.length = insertion.payload.spliceSites.length;
            auto record = InsertionStorage(
                insertion.start,
                insertion.end,
                insertion.payload.sequence.baseOffset,
                insertion.payload.sequence.length,
                sequenceStorage,
                insertion.payload.contigLength,
                spliceSitesStorage,
            );

            file.rawWrite([record]);

            // The next payload begins where the current one ends.
            sequenceStorage.ptr = sequenceStorage[$];
            spliceSitesStorage.ptr = spliceSitesStorage[$];
            version (assert)
            {
                ++writtenInsertions.length;
                assert(writtenInsertions[$] == file.tell());
            }
        }
    }

    /// Write the compressed sequence data of all insertions back to back.
    void writeBlock(T : CompressedBaseQuad)()
    {
        version (assert)
        {
            auto writtenBaseQuads = index.compressedBaseQuads;
            writtenBaseQuads.length = 0;
            assert(writtenBaseQuads.ptr == file.tell());
        }
        foreach (insertion; this.insertions.save)
        {
            // The in-memory data is written verbatim.
            static assert(CompressedBaseQuad.sizeof == StorageType!CompressedBaseQuad.sizeof);
            file.rawWrite(insertion.payload.sequence.data);

            version (assert)
            {
                writtenBaseQuads.length += insertion.payload.sequence.compressedLength;
                assert(writtenBaseQuads[$] == file.tell());
            }
        }
    }

    /// Write the splice sites of all insertions back to back.
    void writeBlock(T : SpliceSite)()
    {
        version (assert)
        {
            auto writtenSpliceSites = index.spliceSites;
            writtenSpliceSites.length = 0;
            assert(writtenSpliceSites.ptr == file.tell());
        }
        foreach (insertion; this.insertions.save)
        {
            // The in-memory data is written verbatim.
            static assert(SpliceSite.sizeof == StorageType!SpliceSite.sizeof);
            file.rawWrite(insertion.payload.spliceSites);

            version (assert)
            {
                writtenSpliceSites.length += insertion.payload.spliceSites.length;
                assert(writtenSpliceSites[$] == file.tell());
            }
        }
    }
}
407 
/**
    Header record of the insertion DB. It stores the file offset at which
    each block begins plus the offset of the end of the file. The struct is
    written to the file as is, so the order of the fields is part of the
    file format.
*/
private struct InsertionDbIndex
{
    mixin DbIndex;

    /// Type of the block that follows the block of `T`s in the file.
    private static template NextType(T)
    {
        static if (is(T == Insertion))
            alias NextType = CompressedBaseQuad;
        else static if (is(T == CompressedBaseQuad))
            alias NextType = SpliceSite;
        else static if (is(T == SpliceSite))
            alias NextType = EOF;
    }

    /// Field that holds the begin offset of the block of `T`s.
    private static template fieldPtr(T)
    {
        static if (is(T == Insertion))
            alias fieldPtr = insertionsPtr;
        else static if (is(T == CompressedBaseQuad))
            alias fieldPtr = compressedBaseQuadsPtr;
        else static if (is(T == SpliceSite))
            alias fieldPtr = spliceSitesPtr;
        else static if (is(T == EOF))
            alias fieldPtr = eofPtr;
    }

    size_t insertionsPtr;
    size_t compressedBaseQuadsPtr;
    size_t spliceSitesPtr;
    size_t eofPtr;

    @property alias insertions = arrayStorage!Insertion;
    @property alias compressedBaseQuads = arrayStorage!CompressedBaseQuad;
    @property alias spliceSites = arrayStorage!SpliceSite;

    /**
        Compute the index of a DB that holds exactly `insertions`.

        Params:
            insertions = input range of insertions with known length
        Returns: index with absolute offsets for all blocks.
    */
    static InsertionDbIndex from(R)(R insertions) nothrow pure
            if (isInputRange!R && hasLength!R && is(ElementType!R : const(Insertion)))
    {
        InsertionDbIndex dbIndex;

        // Pass 1: the insertions block starts right after the index; every
        // other pointer temporarily holds the byte size of the preceding
        // block.
        dbIndex.beginPtr!Insertion = InsertionDbIndex.sizeof;
        dbIndex.endPtr!Insertion = StorageType!Insertion.sizeof * insertions.length;
        foreach (insertion; insertions)
        {
            const quadCount = insertion.payload.sequence.compressedLength;
            const siteCount = insertion.payload.spliceSites.length;

            dbIndex.endPtr!CompressedBaseQuad += StorageType!CompressedBaseQuad.sizeof * quadCount;
            dbIndex.endPtr!SpliceSite += StorageType!SpliceSite.sizeof * siteCount;
        }

        // Pass 2: accumulate the sizes into absolute offsets.
        dbIndex.compressedBaseQuadsPtr += dbIndex.insertionsPtr;
        dbIndex.spliceSitesPtr += dbIndex.compressedBaseQuadsPtr;
        dbIndex.eofPtr += dbIndex.spliceSitesPtr;

        return dbIndex;
    }

    // Each block must start where the previous one ends.
    unittest
    {
        auto idx = InsertionDbIndex.from(getInsertionsTestData());

        assert(idx.insertionsPtr == InsertionDbIndex.sizeof);
        assert(idx.compressedBaseQuadsPtr ==
                idx.insertionsPtr +
                StorageType!Insertion.sizeof * numInsertions);
        assert(idx.spliceSitesPtr ==
                idx.compressedBaseQuadsPtr +
                StorageType!CompressedBaseQuad.sizeof * numCompressedBaseQuads);
        assert(idx.eofPtr ==
                idx.spliceSitesPtr +
                StorageType!SpliceSite.sizeof * numSpliceSites);
    }
}
481 
// `beginPtr!T` and `endPtr!T` must read and write the pointer field of the
// block of `T`s and of the block following it, respectively.
unittest
{
    enum initialBegin = 1;
    enum initialEnd = 2;
    enum overwritten = 3;

    // insertions block: insertionsPtr .. compressedBaseQuadsPtr
    {
        InsertionDbIndex idx;

        idx.insertionsPtr = initialBegin;
        idx.compressedBaseQuadsPtr = initialEnd;

        assert(idx.beginPtr!Insertion == initialBegin);
        assert(idx.endPtr!Insertion == initialEnd);

        idx.beginPtr!Insertion = overwritten;
        idx.endPtr!Insertion = overwritten;

        assert(idx.insertionsPtr == overwritten);
        assert(idx.compressedBaseQuadsPtr == overwritten);
    }
    // compressed base quads block: compressedBaseQuadsPtr .. spliceSitesPtr
    {
        InsertionDbIndex idx;

        idx.compressedBaseQuadsPtr = initialBegin;
        idx.spliceSitesPtr = initialEnd;

        assert(idx.beginPtr!CompressedBaseQuad == initialBegin);
        assert(idx.endPtr!CompressedBaseQuad == initialEnd);

        idx.beginPtr!CompressedBaseQuad = overwritten;
        idx.endPtr!CompressedBaseQuad = overwritten;

        assert(idx.compressedBaseQuadsPtr == overwritten);
        assert(idx.spliceSitesPtr == overwritten);
    }
    // splice sites block: spliceSitesPtr .. eofPtr
    {
        InsertionDbIndex idx;

        idx.spliceSitesPtr = initialBegin;
        idx.eofPtr = initialEnd;

        assert(idx.beginPtr!SpliceSite == initialBegin);
        assert(idx.endPtr!SpliceSite == initialEnd);

        idx.beginPtr!SpliceSite = overwritten;
        idx.endPtr!SpliceSite = overwritten;

        assert(idx.spliceSitesPtr == overwritten);
        assert(idx.eofPtr == overwritten);
    }
}
534 
/// Maps an in-memory type to the type used to represent it in the DB file:
/// base quads and splice sites are stored as they are, insertions become
/// `InsertionStorage` records and arrays become `ArrayStorage` descriptors
/// of their stored element type.
private template StorageType(T)
{
    // types that are stored verbatim
    static if (is(T == CompressedBaseQuad))
        alias StorageType = CompressedBaseQuad;
    else static if (is(T == SpliceSite))
        alias StorageType = SpliceSite;
    // types with a dedicated storage representation
    else static if (is(T == Insertion))
        alias StorageType = InsertionStorage;
    else static if (is(T == CompressedBaseQuad[]))
        alias StorageType = ArrayStorage!(StorageType!CompressedBaseQuad);
    else static if (is(T == SpliceSite[]))
        alias StorageType = ArrayStorage!(StorageType!SpliceSite);
}
548 
/// Fixed-size record that represents one `Insertion` in the DB file. The
/// array payload is not embedded but referenced via storage descriptors.
/// The record is written to the file as is, so the order and types of the
/// fields are part of the file format.
private struct InsertionStorage
{
    /// Copy of `Insertion.start`.
    ContigNode start;
    /// Copy of `Insertion.end`.
    ContigNode end;
    /// `baseOffset` of the insertion's compressed sequence.
    ubyte baseOffset;
    /// `length` of the insertion's compressed sequence.
    size_t sequenceLength;
    /// Location of the sequence data in the compressed base quads block.
    ArrayStorage!(StorageType!CompressedBaseQuad) sequence;
    /// Copy of the payload's `contigLength`.
    size_t contigLength;
    /// Location of the insertion's splice sites in the splice sites block.
    ArrayStorage!(StorageType!SpliceSite) spliceSites;
}