1 /**
2     Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de>
3     License: Subject to the terms of the MIT license, as written in the
4              included LICENSE file.
5     Authors: Arne Ludwig <arne.ludwig@posteo.de>
6 */
7 module dentist.common.binio._base;
8 
9 import dentist.util.log;
10 import dentist.util.math : ceil, ceildiv;
11 import std.algorithm :
12     all,
13     among,
14     joiner,
15     map,
16     permutations;
17 import std.array : minimallyInitializedArray;
18 import std.bitmanip : bitfields;
19 import std.conv : to;
20 import std.exception : enforce, ErrnoException;
21 import std.format : format;
22 import std.range :
23     chunks,
24     dropExactly,
25     empty,
26     enumerate,
27     front,
28     popFront,
29     retro,
30     takeExactly;
31 import std.stdio : File;
32 import std..string : toLower;
33 import std.traits :
34     isArray,
35     isSomeChar,
36     isSomeString,
37     Unqual;
38 import std.typecons : BitFlags, Flag, No, Yes;
39 
40 
41 void lockIfPossible(ref File file) nothrow
42 {
43     try
44     {
45         file.lock();
46     }
47     catch (Exception e)
48     {
49         logJsonDebug(
50             "info", "could not lock file `%s`",
51             "file", file.name,
52             "error", e.message().to!string,
53         );
54     }
55 }
56 
57 class BinaryIOException : Exception
58 {
59     pure nothrow @nogc @safe this(string msg, string file = __FILE__,
60             size_t line = __LINE__, Throwable nextInChain = null)
61     {
62         super(msg, file, line, nextInChain);
63     }
64 }
65 
66 mixin template DbIndex(T...)
67 {
68     static struct EOF;
69 
70     @property auto ref size_t beginPtr(T)() pure nothrow
71     {
72         return fieldPtr!T;
73     }
74 
75     @property size_t beginPtr(T)() const pure nothrow
76     {
77         return fieldPtr!T;
78     }
79 
80     @property auto ref size_t endPtr(T)() pure nothrow
81     {
82         return fieldPtr!(NextType!T);
83     }
84 
85     @property size_t endPtr(T)() const pure nothrow
86     {
87         return fieldPtr!(NextType!T);
88     }
89 
90     @property ArrayStorage!(StorageType!T) arrayStorage(T)() const pure nothrow
91     {
92         return typeof(return).fromPtrs(beginPtr!T, endPtr!T);
93     }
94 }
95 
96 T readRecordAt(T)(File dbFile, size_t ptr)
97 {
98     dbFile.seek(ptr.to!long);
99 
100     return readRecord!T(dbFile);
101 }
102 
103 T readRecord(T)(File dbFile)
104 {
105     return readRecords(dbFile, new T[1])[0];
106 }
107 
108 T readRecords(T)(File dbFile, T records) if (isArray!T)
109 {
110     if (records.length == 0)
111         return records;
112 
113     auto expectedLength = records.length;
114     try
115     {
116         records = dbFile.rawRead(records);
117 
118         enforce!BinaryIOException(
119             records.length == expectedLength,
120             format!"malformed DB `%s`: premature end of file"(
121                     dbFile.name)
122         );
123     }
124     catch (ErrnoException e)
125     {
126         throw new BinaryIOException(
127             format!"malformed DB `%s`: cannot read record of type %s: %s"(
128                     dbFile.name, T.stringof, e.msg),
129         );
130     }
131 
132     return records;
133 }
134 
135 struct ArrayStorage(T)
136 {
137     enum elementSize = T.sizeof;
138     size_t ptr;
139     size_t length;
140 
141     ArrayStorage!T opIndex() const pure nothrow
142     {
143         return this;
144     }
145 
146     size_t opIndex(size_t i) const pure nothrow
147     {
148         assert(i <= length, "out of bounds");
149 
150         return ptr + i * elementSize;
151     }
152 
153     ArrayStorage!T opIndex(size_t[2] slice) const pure nothrow
154     {
155         auto from = slice[0];
156         auto to = slice[1];
157         assert(from < to && to <= length, "out of bounds");
158 
159         return typeof(return)(this[from], to - from);
160     }
161 
162     size_t[2] opSlice(size_t dim)(size_t from, size_t to) const pure nothrow
163             if (dim == 0)
164     {
165         assert(from < to, "slice `[from .. to]` must have `from < to`");
166 
167         return [from, to];
168     }
169 
170     size_t opDollar() const pure nothrow
171     {
172         return length;
173     }
174 
175     long indexOf(size_t ptr) const pure
176     {
177         assert((ptr.to!long - this.ptr.to!long) % elementSize.to!long == 0, "bad pointer alignment");
178 
179         return (ptr.to!long - this.ptr.to!long) / elementSize.to!long;
180     }
181 
182     static ArrayStorage!T fromPtrs(size_t fromPtr, size_t toPtr)
183     {
184         assert((toPtr - fromPtr) % elementSize == 0, "bad pointer alignment");
185 
186         return typeof(return)(fromPtr, (toPtr - fromPtr) / elementSize);
187     }
188 }
189 
190 unittest
191 {
192     import core.exception : AssertError;
193     import std.exception : assertThrown;
194 
195     alias T = int;
196     static assert(ArrayStorage!T.elementSize == T.sizeof);
197     enum basePtr = 1337;
198     enum length = 42;
199 
200     ArrayStorage!T storage;
201     storage.ptr = basePtr;
202     storage.length = length;
203 
204     assert(storage[0] == basePtr);
205     assert(storage[13] == basePtr + 13 * T.sizeof);
206     assert(storage[$] == storage[length]);
207     assertThrown!AssertError(storage[length + 1]);
208 
209     auto storageCopy = storage[];
210     assert(storageCopy == storage);
211     --storageCopy.length;
212     assert(storageCopy.length != storage.length);
213 
214     storageCopy = storage[0 .. $];
215     assert(storageCopy == storage);
216     --storageCopy.length;
217     assert(storageCopy.length != storage.length);
218 
219     auto storageSlice = storage[1 .. 13];
220     assert(storageSlice.ptr == storage[1]);
221     assert(storageSlice.length == 12);
222 
223     assert(ArrayStorage!T.fromPtrs(storage[1], storage[13]) == storageSlice);
224 
225     assert(storage.indexOf(basePtr + 13 * T.sizeof) == 13);
226     assert(storage.indexOf(basePtr - 13 * T.sizeof) == -13);
227     assertThrown!AssertError(storage.indexOf(basePtr + 1));
228 }
229 
230 enum CompressedBase : ubyte
231 {
232     a = 0b00,
233     c = 0b01,
234     t = 0b10,
235     g = 0b11,
236 }
237 
238 struct CompressedBaseQuad
239 {
240     private
241     {
242         mixin(bitfields!(
243             CompressedBase, "_0", 2,
244             CompressedBase, "_1", 2,
245             CompressedBase, "_2", 2,
246             CompressedBase, "_3", 2,
247         ));
248     }
249 
250     enum length = 4;
251     alias opDollar = length;
252 
253     CompressedBase[] opIndex() const pure nothrow
254     {
255         return [_0, _1, _2, _3];
256     }
257 
258     CompressedBase opIndex(size_t i) const pure nothrow
259     {
260         switch (i)
261         {
262         case 0:
263             return _0;
264         case 1:
265             return _1;
266         case 2:
267             return _2;
268         case 3:
269             return _3;
270         default:
271             assert(0, "index out of bounds: " ~ i.to!string);
272         }
273     }
274 
275     void opIndexAssign(CompressedBase base, size_t i) pure nothrow
276     {
277         switch (i)
278         {
279         case 0:
280             _0 = base;
281             return;
282         case 1:
283             _1 = base;
284             return;
285         case 2:
286             _2 = base;
287             return;
288         case 3:
289             _3 = base;
290             return;
291         default:
292             assert(0, "index out of bounds: " ~ i.to!string);
293         }
294     }
295 }
296 
297 struct CompressedSequence
298 {
299     private CompressedBaseQuad[] _data;
300     private ubyte _baseOffset;
301     private size_t numBases;
302 
303     static CompressedSequence from(S)(S fastaString) if (isSomeString!S)
304     {
305         auto compressedLength = ceil(fastaString.length, 4) / 4;
306         auto data = minimallyInitializedArray!(CompressedBaseQuad[])(compressedLength);
307 
308         foreach (i, baseQuad; fastaString.chunks(4).enumerate)
309         {
310             static foreach (j; 0 .. 4)
311             {{
312                 if (!baseQuad.empty)
313                 {
314                     data[i][j] = convert(baseQuad.front);
315                     baseQuad.popFront();
316                 }
317 
318             }}
319         }
320 
321         auto compressedSequence = CompressedSequence(data);
322         compressedSequence.numBases = fastaString.length;
323 
324         return compressedSequence;
325     }
326 
327     unittest
328     {
329         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
330         auto cs = CompressedSequence.from(testSequence);
331 
332         static foreach (i, base; testSequence)
333         {
334             mixin("assert(cs[" ~ i.stringof ~ "] == CompressedBase." ~ base.toLower.to!char ~ ");");
335         }
336     }
337 
338     @property const(CompressedBaseQuad[]) data() const pure nothrow
339     {
340         return _data;
341     }
342 
343     @property auto bases(T : CompressedBase, Flag!"reverse" reverse = No.reverse)() const pure nothrow
344     {
345         static if (reverse)
346             return _data
347                 .retro
348                 .map!"a[]"
349                 .map!retro
350                 .joiner
351                 .dropExactly(4 * _data.length - length - _baseOffset)
352                 .takeExactly(length);
353         else
354             return _data
355                 .map!"a[]"
356                 .joiner
357                 .dropExactly(_baseOffset)
358                 .takeExactly(length);
359     }
360 
361     unittest
362     {
363         import std.algorithm : equal;
364 
365         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
366         auto cs = CompressedSequence.from(testSequence);
367 
368         with (CompressedBase)
369         {
370             assert(cs.bases!CompressedBase.equal([
371                 a, t, g, c, c, a, a, c, t, a, c, t, t, t, g, a, a, c, g, c, g, c,
372                 c, g, c, a, a, g, g, c, a, c, a, g, g, t, g, c, g, c, c, t,
373             ]));
374             assert(cs.bases!(CompressedBase, Yes.reverse).equal([
375                 t, c, c, g, c, g, t, g, g, a, c, a, c, g, g, a, a, c, g, c, c, g, c, g, c, a, a,
376                 g, t, t, t, c, a, t, c, a, a, c, c, g, t, a,
377             ]));
378         }
379     }
380 
381     @property auto bases(C, Flag!"reverse" reverse = No.reverse)() const pure nothrow if (isSomeChar!C)
382     {
383         return bases!(CompressedBase, reverse).map!(cb => convert!C(cb));
384     }
385 
386     unittest
387     {
388         import std.algorithm : equal;
389 
390         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
391         auto cs = CompressedSequence.from(testSequence);
392 
393         assert(cs.bases!dchar.equal(testSequence.toLower));
394         assert(cs.bases!(dchar, Yes.reverse).equal(testSequence.toLower.retro));
395     }
396 
397     S to(S)() const pure nothrow if (isSomeString!S)
398     {
399         cast(void) is(S == Char[], Char);
400         alias C = Unqual!Char;
401         auto fastaSequence = minimallyInitializedArray!(C[])(length);
402 
403         foreach (fastaIdx; 0 .. length)
404         {
405             fastaSequence[fastaIdx] = convert!C(this[fastaIdx]);
406         }
407 
408         return cast(S) fastaSequence;
409     }
410 
411     alias toString = to!string;
412 
413     unittest
414     {
415         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
416         auto cs = CompressedSequence.from(testSequence);
417 
418         assert(cs.to!string == testSequence.toLower);
419     }
420 
421     @property ubyte baseOffset() const pure nothrow
422     {
423         return _baseOffset;
424     }
425 
426     @property size_t length() const pure nothrow
427     {
428         return numBases;
429     }
430 
431     alias opDollar = length;
432 
433     unittest
434     {
435         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
436         auto cs = CompressedSequence.from(testSequence);
437 
438         assert(cs.length == testSequence.length);
439         assert(cs[1 .. $ - 2].length == testSequence.length - 3);
440     }
441 
442     CompressedBase opIndex(size_t fastaIdx) const pure nothrow
443     {
444         assert(fastaIdx < length, "index out o bounds" ~ fastaIdx.to!string);
445 
446         fastaIdx += _baseOffset;
447         auto i = fastaIdx / 4;
448         auto j = fastaIdx % 4;
449 
450         return data[i][j];
451     }
452 
453     unittest
454     {
455         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
456         auto cs = CompressedSequence.from(testSequence);
457 
458         assert(cs[0] == CompressedBase.a);
459         assert(cs[1] == CompressedBase.t);
460         assert(cs[2] == CompressedBase.g);
461         assert(cs[3] == CompressedBase.c);
462         assert(cs[$ - 4] == CompressedBase.g);
463         assert(cs[$ - 3] == CompressedBase.c);
464         assert(cs[$ - 2] == CompressedBase.c);
465         assert(cs[$ - 1] == CompressedBase.t);
466 
467         assert(cs[1 .. $ - 2][0] == CompressedBase.t);
468         assert(cs[1 .. $ - 2][1] == CompressedBase.g);
469         assert(cs[1 .. $ - 2][2] == CompressedBase.c);
470         assert(cs[1 .. $ - 2][3] == CompressedBase.c);
471         assert(cs[1 .. $ - 2][$ - 4] == CompressedBase.g);
472         assert(cs[1 .. $ - 2][$ - 3] == CompressedBase.c);
473         assert(cs[1 .. $ - 2][$ - 2] == CompressedBase.g);
474         assert(cs[1 .. $ - 2][$ - 1] == CompressedBase.c);
475     }
476 
477     inout(CompressedSequence) opIndex(size_t[2] slice) inout pure nothrow
478     {
479         assert(0 <= slice[0] && slice[0] <= slice[1] && slice[1] <= length, "index out of bounds");
480 
481 
482         if (slice[1] == slice[0])
483             return typeof(return)();
484 
485         size_t dataStart = slice[0] / 4;
486         size_t dataEnd = ceildiv(slice[1], 4);
487 
488         return typeof(return)(
489             this._data[dataStart .. dataEnd],
490             cast(ubyte) slice[0] % 4,
491             slice[1] - slice[0],
492         );
493     }
494 
495     size_t[2] opSlice(size_t dim)(in size_t start, in size_t end) const pure nothrow if (dim == 0)
496     {
497         return [start, end];
498     }
499 
500     unittest
501     {
502         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
503         auto cs = CompressedSequence.from(testSequence);
504 
505         assert(cs[1 .. $ - 2].to!string == testSequence[1 .. $ - 2].toLower);
506         assert(cs[0 .. $].to!string == testSequence.toLower);
507         assert(cs[0 .. $] == cs);
508     }
509 
510     @property size_t compressedLength() const pure nothrow
511     {
512         return data.length;
513     }
514 
515     unittest
516     {
517         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
518         auto cs = CompressedSequence.from(testSequence);
519 
520         assert(cs.compressedLength == 11);
521     }
522 
523     static size_t compressedLength(S)(in S fastaSequence) if (isSomeString!S)
524     {
525         assert(canConvert(fastaSequence));
526 
527         return ceildiv(fastaSequence.length, 4);
528     }
529 
530     unittest
531     {
532         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
533 
534         assert(compressedLength(testSequence) == 11);
535     }
536 
537     static size_t canConvert(S)(in S fastaSequence) if (isSomeString!S)
538     {
539         return fastaSequence.all!(c => c.toLower.among!('a', 'c', 'g', 't'));
540     }
541 
542     unittest
543     {
544         enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT";
545         enum falseSequence = "atgccaactactttgaNNNNNnnnnnAGGCACAGGTGCGCCT";
546 
547         assert(canConvert(testSequence));
548         assert(!canConvert(falseSequence));
549     }
550 
551     static CompressedBase convert(C)(in C fastaBase) if (isSomeChar!C)
552     {
553         switch (fastaBase.toLower)
554         {
555             case 'a':
556                 return CompressedBase.a;
557             case 'c':
558                 return CompressedBase.c;
559             case 't':
560                 return CompressedBase.t;
561             case 'g':
562                 return CompressedBase.g;
563             default:
564                 throw new Exception("invalid base: " ~ fastaBase.to!string);
565         }
566     }
567 
568     unittest
569     {
570         enum bases = "actgACTG";
571 
572         static foreach (i, base; bases)
573         {
574             mixin("assert(convert(base) == CompressedBase." ~ base.toLower.to!char ~ ");");
575         }
576     }
577 
578     static C convert(C)(in CompressedBase compressedBase)
579     {
580         final switch (compressedBase)
581         {
582             case CompressedBase.a:
583                 return 'a';
584             case CompressedBase.c:
585                 return 'c';
586             case CompressedBase.t:
587                 return 't';
588             case CompressedBase.g:
589                 return 'g';
590         }
591     }
592 
593     unittest
594     {
595         enum bases = "actgACTG";
596 
597         static foreach (i, base; bases)
598         {
599             mixin("assert(convert!dchar(CompressedBase." ~ base.toLower.to!char ~ ") == base.toLower);");
600         }
601     }
602 };
603