1 /**
2 This package contains methods to handle the proprietary binary data
3 container for `Insertion`s.
4
5 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de>
6 License: Subject to the terms of the MIT license, as written in the
7 included LICENSE file.
8 Authors: Arne Ludwig <arne.ludwig@posteo.de>
9 */
10 module dentist.common.binio.insertiondb;
11
12 import core.exception : AssertError;
13 import dentist.common : ReferencePoint;
14 import dentist.common.alignments : AlignmentChain;
15 import dentist.common.binio._base :
16 ArrayStorage,
17 CompressedBaseQuad,
18 CompressedSequence,
19 DbIndex,
20 lockIfPossible,
21 readRecord,
22 readRecordAt,
23 readRecords;
24 import dentist.common.insertions :
25 Insertion,
26 InsertionInfo,
27 SpliceSite;
28 import dentist.common.scaffold :
29 ContigNode,
30 ContigPart;
31 import std.array : minimallyInitializedArray;
32 import std.conv : to;
33 import std.exception : assertThrown, enforce, ErrnoException;
34 import std.format : format;
35 import std.range :
36 ElementType,
37 empty,
38 front,
39 hasLength,
40 isForwardRange,
41 isInputRange,
42 popFront,
43 save;
44 import std.stdio : File;
45 import std.traits : isArray;
46 import std.typecons : tuple, Tuple;
47
48 version (unittest) import dentist.common.binio._testdata :
49 getInsertionsTestData,
50 numCompressedBaseQuads,
51 numInsertions,
52 numSpliceSites;
53
54
/// Exception type used by `InsertionDb` to report failed DB accesses.
class InsertionDbException : Exception
{
    /// Passes all arguments on to the constructor of `Exception` unchanged.
    this(
        string msg,
        string file = __FILE__,
        size_t line = __LINE__,
        Throwable nextInChain = null,
    ) pure nothrow @nogc @safe
    {
        super(msg, file, line, nextInChain);
    }
}
63
/**
    Gives access to the `Insertion`s stored in an insertion DB file and
    offers `write` for creating such a file.

    The DB index is read lazily on first use; insertion records and their
    payload are only read when the DB is indexed or sliced.
*/
struct InsertionDb
{
    /// Storage locations of the three DB blocks touched by a single read.
    private alias DbSlices = Tuple!(
        ArrayStorage!(StorageType!Insertion), "insertions",
        ArrayStorage!(StorageType!CompressedBaseQuad), "compressedBaseQuads",
        ArrayStorage!(StorageType!SpliceSite), "spliceSites",
    );

    private File file;
    private InsertionDbIndex index;
    // Set by `readSlice` and consumed by the private `parse` overloads.
    private DbSlices slices;

    /// Storage of the insertions block as recorded in the index currently
    /// held in memory. Does not load the index.
    @property auto insertions() const pure nothrow
    {
        return index.insertions;
    }

    /// Storage of the compressed base quads block as recorded in the index
    /// currently held in memory. Does not load the index.
    @property auto compressedBaseQuads() const pure nothrow
    {
        return index.compressedBaseQuads;
    }

    /// Storage of the splice sites block as recorded in the index currently
    /// held in memory. Does not load the index.
    @property auto spliceSites() const pure nothrow
    {
        return index.spliceSites;
    }

    /**
        Open `dbFile` for binary reading, pass it to `lockIfPossible` and
        load the DB index.

        Returns: the DB with its index already loaded
    */
    static InsertionDb parse(in string dbFile)
    {
        auto file = File(dbFile, "rb");
        lockIfPossible(file);
        auto db = InsertionDb(file);
        db.ensureDbIndex();

        return db;
    }

    /// Close the underlying file.
    void releaseDb()
    {
        file.close();
    }

    /// Read all insertions contained in the DB.
    Insertion[] opIndex()
    {
        ensureDbIndex();

        return readSlice(0, length);
    }

    /**
        Read the insertion at index `i`.

        Throws: InsertionDbException if `i` is not less than `length`
    */
    Insertion opIndex(size_t i)
    {
        ensureDbIndex();
        enforce!InsertionDbException(
            i < length,
            format!"cannot read block %d in `%s`: out of bounds [0, %d)"(
                i, file.name, length)
        );

        return readSlice(i, i + 1)[0];
    }

    /**
        Read the insertions in the half-open index range
        `slice[0] .. slice[1]`.

        Throws: InsertionDbException if `slice[0] > slice[1]` or if
                `slice[1] > length`
    */
    Insertion[] opIndex(size_t[2] slice)
    {
        auto from = slice[0];
        auto to = slice[1];
        ensureDbIndex();
        // The asserts in `opSlice` and `readSlice` vanish in release builds,
        // hence reject reversed bounds explicitly before touching the file.
        enforce!InsertionDbException(
            from <= to,
            format!"cannot read blocks %d-%d in `%s`: invalid slice"(
                from, to, file.name)
        );
        enforce!InsertionDbException(
            to <= length,
            format!"cannot read blocks %d-%d in `%s`: out of bounds [0, %d]"(
                from, to, file.name, length)
        );

        return readSlice(from, to);
    }

    /// Produce the bounds for `db[from .. to]`; asserts that `from < to`.
    size_t[2] opSlice(size_t dim)(size_t from, size_t to)
            if (dim == 0)
    {
        assert(from < to, "invalid slice");

        return [from, to];
    }

    /// Number of insertions according to the DB index; loads the index if
    /// required.
    @property size_t length()
    {
        ensureDbIndex();

        return index.insertions.length;
    }

    alias opDollar = length;

    /// Read the index record from `file` unless the index held in memory
    /// already differs from `InsertionDbIndex.init`.
    private void ensureDbIndex()
    {
        if (index != index.init)
            return;

        // NOTE(review): presumably reads at the current file position, i.e.
        // expects `file` to be positioned at the start of the DB -- confirm
        // against `readRecord` in `_base`.
        index = file.readRecord!InsertionDbIndex();
    }

    /// Read insertions `from .. to` including their sequence and splice
    /// sites. Returns an empty array if `from == to`.
    private Insertion[] readSlice(size_t from, size_t to)
    {
        assert(from <= to && to <= length);

        if (from == to)
            return [];

        // Step 1: determine memory requirements and DB slices
        slices = getSlices(from, to);

        // Step 2: allocate minimally initialized memory for all blocks
        auto insertions = minimallyInitializedArray!(Insertion[])(slices.insertions.length);
        auto compressedBaseQuads = minimallyInitializedArray!(CompressedBaseQuad[])(slices.compressedBaseQuads.length);
        auto spliceSites = minimallyInitializedArray!(SpliceSite[])(slices.spliceSites.length);

        // Step 3: parse each record for each block assigning already
        //         allocated array slices to the array fields; the payload
        //         arrays are filled afterwards by the remaining two calls
        parse(insertions, compressedBaseQuads, spliceSites);
        parse(compressedBaseQuads);
        parse(spliceSites);

        return insertions;
    }

    /// Determine the parts of the three DB blocks that belong to insertions
    /// `from .. to` by inspecting the first and the last record of the range.
    private DbSlices getSlices(size_t from, size_t to)
    {
        auto insertions = index.insertions[from .. to];
        auto firstInsertion = file.readRecordAt!(StorageType!Insertion)(insertions[0]);
        auto lastInsertion = file.readRecordAt!(StorageType!Insertion)(insertions[$ - 1]);

        // payload ranges from the begin of the first record's payload to the
        // end of the last record's payload
        auto compressedBaseQuads = ArrayStorage!(StorageType!CompressedBaseQuad).fromPtrs(
            firstInsertion.sequence[0],
            lastInsertion.sequence[$],
        );

        auto spliceSites = ArrayStorage!(StorageType!SpliceSite).fromPtrs(
            firstInsertion.spliceSites[0],
            lastInsertion.spliceSites[$],
        );

        return DbSlices(
            insertions,
            compressedBaseQuads,
            spliceSites,
        );
    }

    /// Read one insertion record per element of `insertions`, starting at
    /// `slices.insertions.ptr`, and hand each insertion the next consecutive
    /// part of `compressedBaseQuads` and `spliceSites`.
    private void parse(
        ref Insertion[] insertions,
        CompressedBaseQuad[] compressedBaseQuads,
        SpliceSite[] spliceSites,
    )
    {
        file.seek(slices.insertions.ptr);

        // running [begin, end) positions inside the payload arrays
        size_t[2] compressedBaseQuadsSlice;
        size_t[2] spliceSitesSlice;

        foreach (ref insertion; insertions)
        {
            auto insertionStorage = file.readRecord!(StorageType!Insertion);

            compressedBaseQuadsSlice[0] = compressedBaseQuadsSlice[1];
            compressedBaseQuadsSlice[1] += insertionStorage.sequence.length;

            spliceSitesSlice[0] = spliceSitesSlice[1];
            spliceSitesSlice[1] += insertionStorage.spliceSites.length;

            insertion = Insertion(
                insertionStorage.start,
                insertionStorage.end,
                InsertionInfo(
                    CompressedSequence(
                        compressedBaseQuads[
                            compressedBaseQuadsSlice[0] .. compressedBaseQuadsSlice[1]
                        ],
                        insertionStorage.baseOffset,
                        insertionStorage.sequenceLength,
                    ),
                    insertionStorage.contigLength,
                    spliceSites[spliceSitesSlice[0] .. spliceSitesSlice[1]],
                ),
            );
        }
    }

    /// Read the splice sites starting at `slices.spliceSites.ptr` into
    /// `spliceSites`.
    private void parse(
        ref SpliceSite[] spliceSites,
    )
    {
        static assert(SpliceSite.sizeof == StorageType!SpliceSite.sizeof);
        file.seek(slices.spliceSites.ptr);
        spliceSites = file.readRecords(spliceSites);
    }

    /// Read the compressed base quads starting at
    /// `slices.compressedBaseQuads.ptr` into `compressedBaseQuads`.
    private void parse(
        ref CompressedBaseQuad[] compressedBaseQuads,
    )
    {
        static assert(CompressedBaseQuad.sizeof == StorageType!CompressedBaseQuad.sizeof);
        file.seek(slices.compressedBaseQuads.ptr);
        compressedBaseQuads = file.readRecords(compressedBaseQuads);
    }

    /// Open `dbFile` for binary writing, pass it to `lockIfPossible` and
    /// write `insertions` to it.
    static void write(R)(in string dbFile, R insertions)
            if (isForwardRange!R && hasLength!R && is(ElementType!R : const(Insertion)))
    {
        auto writer = InsertionDbFileWriter!R(File(dbFile, "wb"), insertions);

        lockIfPossible(writer.file);
        writer.writeToFile();
    }
}
277
// Round trip: test data written by `InsertionDbFileWriter` has the expected
// file size and is read back unchanged through `InsertionDb`.
unittest
{
    import dentist.util.tempfile : mkstemp;
    import std.file : remove;

    auto expectedInsertions = getInsertionsTestData();

    // index record followed by the three data blocks
    enum expectedFileSize =
        InsertionDbIndex.sizeof +
        numInsertions * StorageType!Insertion.sizeof +
        numCompressedBaseQuads * StorageType!CompressedBaseQuad.sizeof +
        numSpliceSites * StorageType!SpliceSite.sizeof;

    auto tmpDb = mkstemp("./.unittest-XXXXXX");
    scope (exit)
    {
        tmpDb.file.close();
        remove(tmpDb.name);
    }

    InsertionDbFileWriter!(Insertion[])(tmpDb.file, expectedInsertions).writeToFile();
    tmpDb.file.sync();
    assert(tmpDb.file.size == expectedFileSize);

    // read back from the start of the very same file
    tmpDb.file.rewind();
    auto db = InsertionDb(tmpDb.file);
    assert(db[] == expectedInsertions);
}
308
/**
    Serializes a range of `Insertion`s into `file`: the DB index comes first,
    followed by one block each of insertion records, compressed base quads
    and splice sites.
*/
private struct InsertionDbFileWriter(R)
        if (isForwardRange!R && hasLength!R && is(ElementType!R : const(Insertion)))
{
    File file;
    R insertions;
    InsertionDbIndex index;

    /// Build the index for `insertions`, then write the index and all three
    /// blocks to `file`.
    void writeToFile()
    {
        index = InsertionDbIndex.from(insertions.save);
        file.rawWrite([index]);

        writeBlock!Insertion();
        writeBlock!CompressedBaseQuad();
        writeBlock!SpliceSite();
    }

    /// Write one `InsertionStorage` record per insertion. Each record refers
    /// to its part of the sequence and splice site blocks.
    void writeBlock(T : Insertion)()
    {
        // Moving windows over the payload blocks: each starts out empty at
        // the begin of its block and is advanced past every record's payload.
        auto sequenceWindow = index.compressedBaseQuads;
        auto spliceSitesWindow = index.spliceSites;
        sequenceWindow.length = 0;
        spliceSitesWindow.length = 0;

        version (assert)
        {
            // tracks the records written so far to verify the file position
            auto writtenRecords = index.insertions;
            writtenRecords.length = 0;
            assert(writtenRecords.ptr == file.tell());
        }

        foreach (insertion; this.insertions.save)
        {
            sequenceWindow.length = insertion.payload.sequence.compressedLength;
            spliceSitesWindow.length = insertion.payload.spliceSites.length;

            file.rawWrite([InsertionStorage(
                insertion.start,
                insertion.end,
                insertion.payload.sequence.baseOffset,
                insertion.payload.sequence.length,
                sequenceWindow,
                insertion.payload.contigLength,
                spliceSitesWindow,
            )]);

            // the next record's payload begins where this one's ends
            sequenceWindow.ptr = sequenceWindow[$];
            spliceSitesWindow.ptr = spliceSitesWindow[$];

            version (assert)
            {
                ++writtenRecords.length;
                assert(writtenRecords[$] == file.tell());
            }
        }
    }

    /// Write the compressed sequence data of all insertions back to back.
    void writeBlock(T : CompressedBaseQuad)()
    {
        // the in-memory data is written as is, so both types must agree in size
        static assert(CompressedBaseQuad.sizeof == StorageType!CompressedBaseQuad.sizeof);

        version (assert)
        {
            auto writtenBaseQuads = index.compressedBaseQuads;
            writtenBaseQuads.length = 0;
            assert(writtenBaseQuads.ptr == file.tell());
        }

        foreach (insertion; this.insertions.save)
        {
            file.rawWrite(insertion.payload.sequence.data);

            version (assert)
            {
                writtenBaseQuads.length += insertion.payload.sequence.compressedLength;
                assert(writtenBaseQuads[$] == file.tell());
            }
        }
    }

    /// Write the splice sites of all insertions back to back.
    void writeBlock(T : SpliceSite)()
    {
        // the in-memory data is written as is, so both types must agree in size
        static assert(SpliceSite.sizeof == StorageType!SpliceSite.sizeof);

        version (assert)
        {
            auto writtenSpliceSites = index.spliceSites;
            writtenSpliceSites.length = 0;
            assert(writtenSpliceSites.ptr == file.tell());
        }

        foreach (insertion; this.insertions.save)
        {
            file.rawWrite(insertion.payload.spliceSites);

            version (assert)
            {
                writtenSpliceSites.length += insertion.payload.spliceSites.length;
                assert(writtenSpliceSites[$] == file.tell());
            }
        }
    }
}
407
/**
    Index record at the start of an insertion DB. It stores the file offset
    at which each of the three blocks begins plus the offset of the end of
    the file; each block ends where the next one begins.
*/
private struct InsertionDbIndex
{
    mixin DbIndex;

    // Type whose block follows the block of `T`s.
    // NOTE(review): `EOF` is not declared in this file; presumably it is
    // provided by `mixin DbIndex` -- confirm in `_base`.
    private static template NextType(T)
    {
        static if (is(T == Insertion))
            alias NextType = CompressedBaseQuad;
        else static if (is(T == CompressedBaseQuad))
            alias NextType = SpliceSite;
        else static if (is(T == SpliceSite))
            alias NextType = EOF;
    }

    // Field that holds the begin offset of the block of `T`s.
    private static template fieldPtr(T)
    {
        static if (is(T == Insertion))
            alias fieldPtr = insertionsPtr;
        else static if (is(T == CompressedBaseQuad))
            alias fieldPtr = compressedBaseQuadsPtr;
        else static if (is(T == SpliceSite))
            alias fieldPtr = spliceSitesPtr;
        else static if (is(T == EOF))
            alias fieldPtr = eofPtr;
    }

    size_t insertionsPtr;
    size_t compressedBaseQuadsPtr;
    size_t spliceSitesPtr;
    size_t eofPtr;

    @property alias insertions = arrayStorage!Insertion;
    @property alias compressedBaseQuads = arrayStorage!CompressedBaseQuad;
    @property alias spliceSites = arrayStorage!SpliceSite;

    /// Compute the index of a DB that holds exactly the given `insertions`.
    static InsertionDbIndex from(R)(R insertions) nothrow pure
            if (isInputRange!R && hasLength!R && is(ElementType!R : const(Insertion)))
    {
        // Query the length before iterating the range.
        immutable size_t insertionsBytes = StorageType!Insertion.sizeof * insertions.length;
        size_t baseQuadsBytes;
        size_t spliceSitesBytes;

        foreach (insertion; insertions)
        {
            baseQuadsBytes += StorageType!CompressedBaseQuad.sizeof *
                insertion.payload.sequence.compressedLength;
            spliceSitesBytes += StorageType!SpliceSite.sizeof *
                insertion.payload.spliceSites.length;
        }

        // The blocks directly follow the index record, one after another.
        InsertionDbIndex index;

        index.insertionsPtr = InsertionDbIndex.sizeof;
        index.compressedBaseQuadsPtr = index.insertionsPtr + insertionsBytes;
        index.spliceSitesPtr = index.compressedBaseQuadsPtr + baseQuadsBytes;
        index.eofPtr = index.spliceSitesPtr + spliceSitesBytes;

        return index;
    }

    unittest
    {
        enum expectedInsertionsBytes = StorageType!Insertion.sizeof * numInsertions;
        enum expectedBaseQuadsBytes = StorageType!CompressedBaseQuad.sizeof * numCompressedBaseQuads;
        enum expectedSpliceSitesBytes = StorageType!SpliceSite.sizeof * numSpliceSites;

        auto testIndex = InsertionDbIndex.from(getInsertionsTestData());

        assert(testIndex.insertionsPtr == InsertionDbIndex.sizeof);
        assert(testIndex.compressedBaseQuadsPtr ==
                testIndex.insertionsPtr + expectedInsertionsBytes);
        assert(testIndex.spliceSitesPtr ==
                testIndex.compressedBaseQuadsPtr + expectedBaseQuadsBytes);
        assert(testIndex.eofPtr ==
                testIndex.spliceSitesPtr + expectedSpliceSitesBytes);
    }
}
481
// `beginPtr!T` and `endPtr!T` read and write the pointer field of the block
// of `T`s and the pointer field of the block after it, respectively.
unittest
{
    enum lower = 1;
    enum upper = 2;
    enum overwritten = 3;

    // insertions block: insertionsPtr .. compressedBaseQuadsPtr
    {
        InsertionDbIndex idx;

        idx.insertionsPtr = lower;
        idx.compressedBaseQuadsPtr = upper;
        assert(idx.beginPtr!Insertion == lower);
        assert(idx.endPtr!Insertion == upper);

        idx.beginPtr!Insertion = overwritten;
        idx.endPtr!Insertion = overwritten;
        assert(idx.insertionsPtr == overwritten);
        assert(idx.compressedBaseQuadsPtr == overwritten);
    }

    // compressed base quads block: compressedBaseQuadsPtr .. spliceSitesPtr
    {
        InsertionDbIndex idx;

        idx.compressedBaseQuadsPtr = lower;
        idx.spliceSitesPtr = upper;
        assert(idx.beginPtr!CompressedBaseQuad == lower);
        assert(idx.endPtr!CompressedBaseQuad == upper);

        idx.beginPtr!CompressedBaseQuad = overwritten;
        idx.endPtr!CompressedBaseQuad = overwritten;
        assert(idx.compressedBaseQuadsPtr == overwritten);
        assert(idx.spliceSitesPtr == overwritten);
    }

    // splice sites block: spliceSitesPtr .. eofPtr
    {
        InsertionDbIndex idx;

        idx.spliceSitesPtr = lower;
        idx.eofPtr = upper;
        assert(idx.beginPtr!SpliceSite == lower);
        assert(idx.endPtr!SpliceSite == upper);

        idx.beginPtr!SpliceSite = overwritten;
        idx.endPtr!SpliceSite = overwritten;
        assert(idx.spliceSitesPtr == overwritten);
        assert(idx.eofPtr == overwritten);
    }
}
534
/// Maps an in-memory type to the type that represents it inside the DB
/// file. Defined for `Insertion`, `CompressedBaseQuad`, `SpliceSite` and for
/// arrays of the latter two.
private template StorageType(T)
{
    // record and element types
    static if (is(T == Insertion))
        alias StorageType = InsertionStorage;
    else static if (is(T == CompressedBaseQuad))
        alias StorageType = CompressedBaseQuad;
    else static if (is(T == SpliceSite))
        alias StorageType = SpliceSite;
    // arrays are represented by an `ArrayStorage` of the element's storage type
    else static if (is(T == CompressedBaseQuad[]))
        alias StorageType = ArrayStorage!(StorageType!CompressedBaseQuad);
    else static if (is(T == SpliceSite[]))
        alias StorageType = ArrayStorage!(StorageType!SpliceSite);
}
548
/// Record written to the DB file for one `Insertion`. The field order is
/// part of the file format (records are written via `rawWrite`), so it must
/// not be changed.
private struct InsertionStorage
{
    ContigNode start;
    ContigNode end;
    ubyte baseOffset;
    size_t sequenceLength;
    // where this insertion's compressed base quads are stored
    ArrayStorage!(StorageType!CompressedBaseQuad) sequence;
    size_t contigLength;
    // where this insertion's splice sites are stored
    ArrayStorage!(StorageType!SpliceSite) spliceSites;
}