1 /** 2 Copyright: © 2018 Arne Ludwig <arne.ludwig@posteo.de> 3 License: Subject to the terms of the MIT license, as written in the 4 included LICENSE file. 5 Authors: Arne Ludwig <arne.ludwig@posteo.de> 6 */ 7 module dentist.common.binio._base; 8 9 import dentist.util.log; 10 import dentist.util.math : ceil, ceildiv; 11 import std.algorithm : 12 all, 13 among, 14 joiner, 15 map, 16 permutations; 17 import std.array : minimallyInitializedArray; 18 import std.bitmanip : bitfields; 19 import std.conv : to; 20 import std.exception : enforce, ErrnoException; 21 import std.format : format; 22 import std.range : 23 chunks, 24 dropExactly, 25 empty, 26 enumerate, 27 front, 28 popFront, 29 retro, 30 takeExactly; 31 import std.stdio : File; 32 import std..string : toLower; 33 import std.traits : 34 isArray, 35 isSomeChar, 36 isSomeString, 37 Unqual; 38 import std.typecons : BitFlags, Flag, No, Yes; 39 40 41 void lockIfPossible(ref File file) nothrow 42 { 43 try 44 { 45 file.lock(); 46 } 47 catch (Exception e) 48 { 49 logJsonDebug( 50 "info", "could not lock file `%s`", 51 "file", file.name, 52 "error", e.message().to!string, 53 ); 54 } 55 } 56 57 class BinaryIOException : Exception 58 { 59 pure nothrow @nogc @safe this(string msg, string file = __FILE__, 60 size_t line = __LINE__, Throwable nextInChain = null) 61 { 62 super(msg, file, line, nextInChain); 63 } 64 } 65 66 mixin template DbIndex(T...) 67 { 68 static struct EOF; 69 70 @property auto ref size_t beginPtr(T)() pure nothrow 71 { 72 return fieldPtr!T; 73 } 74 75 @property size_t beginPtr(T)() const pure nothrow 76 { 77 return fieldPtr!T; 78 } 79 80 @property auto ref size_t endPtr(T)() pure nothrow 81 { 82 return fieldPtr!(NextType!T); 83 } 84 85 @property size_t endPtr(T)() const pure nothrow 86 { 87 return fieldPtr!(NextType!T); 88 } 89 90 @property ArrayStorage!(StorageType!T) arrayStorage(T)() const pure nothrow 91 { 92 return typeof(return).fromPtrs(beginPtr!T, endPtr!T); 93 } 94 } 95 96 T readRecordAt(T)(File dbFile, size_t ptr) 97 { 98 dbFile.seek(ptr.to!long); 99 100 return readRecord!T(dbFile); 101 } 102 103 T readRecord(T)(File dbFile) 104 { 105 return readRecords(dbFile, new T[1])[0]; 106 } 107 108 T readRecords(T)(File dbFile, T records) if (isArray!T) 109 { 110 if (records.length == 0) 111 return records; 112 113 auto expectedLength = records.length; 114 try 115 { 116 records = dbFile.rawRead(records); 117 118 enforce!BinaryIOException( 119 records.length == expectedLength, 120 format!"malformed DB `%s`: premature end of file"( 121 dbFile.name) 122 ); 123 } 124 catch (ErrnoException e) 125 { 126 throw new BinaryIOException( 127 format!"malformed DB `%s`: cannot read record of type %s: %s"( 128 dbFile.name, T.stringof, e.msg), 129 ); 130 } 131 132 return records; 133 } 134 135 struct ArrayStorage(T) 136 { 137 enum elementSize = T.sizeof; 138 size_t ptr; 139 size_t length; 140 141 ArrayStorage!T opIndex() const pure nothrow 142 { 143 return this; 144 } 145 146 size_t opIndex(size_t i) const pure nothrow 147 { 148 assert(i <= length, "out of bounds"); 149 150 return ptr + i * elementSize; 151 } 152 153 ArrayStorage!T opIndex(size_t[2] slice) const pure nothrow 154 { 155 auto from = slice[0]; 156 auto to = slice[1]; 157 assert(from < to && to <= length, "out of bounds"); 158 159 return typeof(return)(this[from], to - from); 160 } 161 162 size_t[2] opSlice(size_t dim)(size_t from, size_t to) const pure nothrow 163 if (dim == 0) 164 { 165 assert(from < to, "slice `[from .. to]` must have `from < to`"); 166 167 return [from, to]; 168 } 169 170 size_t opDollar() const pure nothrow 171 { 172 return length; 173 } 174 175 long indexOf(size_t ptr) const pure 176 { 177 assert((ptr.to!long - this.ptr.to!long) % elementSize.to!long == 0, "bad pointer alignment"); 178 179 return (ptr.to!long - this.ptr.to!long) / elementSize.to!long; 180 } 181 182 static ArrayStorage!T fromPtrs(size_t fromPtr, size_t toPtr) 183 { 184 assert((toPtr - fromPtr) % elementSize == 0, "bad pointer alignment"); 185 186 return typeof(return)(fromPtr, (toPtr - fromPtr) / elementSize); 187 } 188 } 189 190 unittest 191 { 192 import core.exception : AssertError; 193 import std.exception : assertThrown; 194 195 alias T = int; 196 static assert(ArrayStorage!T.elementSize == T.sizeof); 197 enum basePtr = 1337; 198 enum length = 42; 199 200 ArrayStorage!T storage; 201 storage.ptr = basePtr; 202 storage.length = length; 203 204 assert(storage[0] == basePtr); 205 assert(storage[13] == basePtr + 13 * T.sizeof); 206 assert(storage[$] == storage[length]); 207 assertThrown!AssertError(storage[length + 1]); 208 209 auto storageCopy = storage[]; 210 assert(storageCopy == storage); 211 --storageCopy.length; 212 assert(storageCopy.length != storage.length); 213 214 storageCopy = storage[0 .. $]; 215 assert(storageCopy == storage); 216 --storageCopy.length; 217 assert(storageCopy.length != storage.length); 218 219 auto storageSlice = storage[1 .. 13]; 220 assert(storageSlice.ptr == storage[1]); 221 assert(storageSlice.length == 12); 222 223 assert(ArrayStorage!T.fromPtrs(storage[1], storage[13]) == storageSlice); 224 225 assert(storage.indexOf(basePtr + 13 * T.sizeof) == 13); 226 assert(storage.indexOf(basePtr - 13 * T.sizeof) == -13); 227 assertThrown!AssertError(storage.indexOf(basePtr + 1)); 228 } 229 230 enum CompressedBase : ubyte 231 { 232 a = 0b00, 233 c = 0b01, 234 t = 0b10, 235 g = 0b11, 236 } 237 238 struct CompressedBaseQuad 239 { 240 private 241 { 242 mixin(bitfields!( 243 CompressedBase, "_0", 2, 244 CompressedBase, "_1", 2, 245 CompressedBase, "_2", 2, 246 CompressedBase, "_3", 2, 247 )); 248 } 249 250 enum length = 4; 251 alias opDollar = length; 252 253 CompressedBase[] opIndex() const pure nothrow 254 { 255 return [_0, _1, _2, _3]; 256 } 257 258 CompressedBase opIndex(size_t i) const pure nothrow 259 { 260 switch (i) 261 { 262 case 0: 263 return _0; 264 case 1: 265 return _1; 266 case 2: 267 return _2; 268 case 3: 269 return _3; 270 default: 271 assert(0, "index out of bounds: " ~ i.to!string); 272 } 273 } 274 275 void opIndexAssign(CompressedBase base, size_t i) pure nothrow 276 { 277 switch (i) 278 { 279 case 0: 280 _0 = base; 281 return; 282 case 1: 283 _1 = base; 284 return; 285 case 2: 286 _2 = base; 287 return; 288 case 3: 289 _3 = base; 290 return; 291 default: 292 assert(0, "index out of bounds: " ~ i.to!string); 293 } 294 } 295 } 296 297 struct CompressedSequence 298 { 299 private CompressedBaseQuad[] _data; 300 private ubyte _baseOffset; 301 private size_t numBases; 302 303 static CompressedSequence from(S)(S fastaString) if (isSomeString!S) 304 { 305 auto compressedLength = ceil(fastaString.length, 4) / 4; 306 auto data = minimallyInitializedArray!(CompressedBaseQuad[])(compressedLength); 307 308 foreach (i, baseQuad; fastaString.chunks(4).enumerate) 309 { 310 static foreach (j; 0 .. 4) 311 {{ 312 if (!baseQuad.empty) 313 { 314 data[i][j] = convert(baseQuad.front); 315 baseQuad.popFront(); 316 } 317 318 }} 319 } 320 321 auto compressedSequence = CompressedSequence(data); 322 compressedSequence.numBases = fastaString.length; 323 324 return compressedSequence; 325 } 326 327 unittest 328 { 329 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 330 auto cs = CompressedSequence.from(testSequence); 331 332 static foreach (i, base; testSequence) 333 { 334 mixin("assert(cs[" ~ i.stringof ~ "] == CompressedBase." ~ base.toLower.to!char ~ ");"); 335 } 336 } 337 338 @property const(CompressedBaseQuad[]) data() const pure nothrow 339 { 340 return _data; 341 } 342 343 @property auto bases(T : CompressedBase, Flag!"reverse" reverse = No.reverse)() const pure nothrow 344 { 345 static if (reverse) 346 return _data 347 .retro 348 .map!"a[]" 349 .map!retro 350 .joiner 351 .dropExactly(4 * _data.length - length - _baseOffset) 352 .takeExactly(length); 353 else 354 return _data 355 .map!"a[]" 356 .joiner 357 .dropExactly(_baseOffset) 358 .takeExactly(length); 359 } 360 361 unittest 362 { 363 import std.algorithm : equal; 364 365 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 366 auto cs = CompressedSequence.from(testSequence); 367 368 with (CompressedBase) 369 { 370 assert(cs.bases!CompressedBase.equal([ 371 a, t, g, c, c, a, a, c, t, a, c, t, t, t, g, a, a, c, g, c, g, c, 372 c, g, c, a, a, g, g, c, a, c, a, g, g, t, g, c, g, c, c, t, 373 ])); 374 assert(cs.bases!(CompressedBase, Yes.reverse).equal([ 375 t, c, c, g, c, g, t, g, g, a, c, a, c, g, g, a, a, c, g, c, c, g, c, g, c, a, a, 376 g, t, t, t, c, a, t, c, a, a, c, c, g, t, a, 377 ])); 378 } 379 } 380 381 @property auto bases(C, Flag!"reverse" reverse = No.reverse)() const pure nothrow if (isSomeChar!C) 382 { 383 return bases!(CompressedBase, reverse).map!(cb => convert!C(cb)); 384 } 385 386 unittest 387 { 388 import std.algorithm : equal; 389 390 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 391 auto cs = CompressedSequence.from(testSequence); 392 393 assert(cs.bases!dchar.equal(testSequence.toLower)); 394 assert(cs.bases!(dchar, Yes.reverse).equal(testSequence.toLower.retro)); 395 } 396 397 S to(S)() const pure nothrow if (isSomeString!S) 398 { 399 cast(void) is(S == Char[], Char); 400 alias C = Unqual!Char; 401 auto fastaSequence = minimallyInitializedArray!(C[])(length); 402 403 foreach (fastaIdx; 0 .. length) 404 { 405 fastaSequence[fastaIdx] = convert!C(this[fastaIdx]); 406 } 407 408 return cast(S) fastaSequence; 409 } 410 411 alias toString = to!string; 412 413 unittest 414 { 415 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 416 auto cs = CompressedSequence.from(testSequence); 417 418 assert(cs.to!string == testSequence.toLower); 419 } 420 421 @property ubyte baseOffset() const pure nothrow 422 { 423 return _baseOffset; 424 } 425 426 @property size_t length() const pure nothrow 427 { 428 return numBases; 429 } 430 431 alias opDollar = length; 432 433 unittest 434 { 435 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 436 auto cs = CompressedSequence.from(testSequence); 437 438 assert(cs.length == testSequence.length); 439 assert(cs[1 .. $ - 2].length == testSequence.length - 3); 440 } 441 442 CompressedBase opIndex(size_t fastaIdx) const pure nothrow 443 { 444 assert(fastaIdx < length, "index out o bounds" ~ fastaIdx.to!string); 445 446 fastaIdx += _baseOffset; 447 auto i = fastaIdx / 4; 448 auto j = fastaIdx % 4; 449 450 return data[i][j]; 451 } 452 453 unittest 454 { 455 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 456 auto cs = CompressedSequence.from(testSequence); 457 458 assert(cs[0] == CompressedBase.a); 459 assert(cs[1] == CompressedBase.t); 460 assert(cs[2] == CompressedBase.g); 461 assert(cs[3] == CompressedBase.c); 462 assert(cs[$ - 4] == CompressedBase.g); 463 assert(cs[$ - 3] == CompressedBase.c); 464 assert(cs[$ - 2] == CompressedBase.c); 465 assert(cs[$ - 1] == CompressedBase.t); 466 467 assert(cs[1 .. $ - 2][0] == CompressedBase.t); 468 assert(cs[1 .. $ - 2][1] == CompressedBase.g); 469 assert(cs[1 .. $ - 2][2] == CompressedBase.c); 470 assert(cs[1 .. $ - 2][3] == CompressedBase.c); 471 assert(cs[1 .. $ - 2][$ - 4] == CompressedBase.g); 472 assert(cs[1 .. $ - 2][$ - 3] == CompressedBase.c); 473 assert(cs[1 .. $ - 2][$ - 2] == CompressedBase.g); 474 assert(cs[1 .. $ - 2][$ - 1] == CompressedBase.c); 475 } 476 477 inout(CompressedSequence) opIndex(size_t[2] slice) inout pure nothrow 478 { 479 assert(0 <= slice[0] && slice[0] <= slice[1] && slice[1] <= length, "index out of bounds"); 480 481 482 if (slice[1] == slice[0]) 483 return typeof(return)(); 484 485 size_t dataStart = slice[0] / 4; 486 size_t dataEnd = ceildiv(slice[1], 4); 487 488 return typeof(return)( 489 this._data[dataStart .. dataEnd], 490 cast(ubyte) slice[0] % 4, 491 slice[1] - slice[0], 492 ); 493 } 494 495 size_t[2] opSlice(size_t dim)(in size_t start, in size_t end) const pure nothrow if (dim == 0) 496 { 497 return [start, end]; 498 } 499 500 unittest 501 { 502 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 503 auto cs = CompressedSequence.from(testSequence); 504 505 assert(cs[1 .. $ - 2].to!string == testSequence[1 .. $ - 2].toLower); 506 assert(cs[0 .. $].to!string == testSequence.toLower); 507 assert(cs[0 .. $] == cs); 508 } 509 510 @property size_t compressedLength() const pure nothrow 511 { 512 return data.length; 513 } 514 515 unittest 516 { 517 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 518 auto cs = CompressedSequence.from(testSequence); 519 520 assert(cs.compressedLength == 11); 521 } 522 523 static size_t compressedLength(S)(in S fastaSequence) if (isSomeString!S) 524 { 525 assert(canConvert(fastaSequence)); 526 527 return ceildiv(fastaSequence.length, 4); 528 } 529 530 unittest 531 { 532 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 533 534 assert(compressedLength(testSequence) == 11); 535 } 536 537 static size_t canConvert(S)(in S fastaSequence) if (isSomeString!S) 538 { 539 return fastaSequence.all!(c => c.toLower.among!('a', 'c', 'g', 't')); 540 } 541 542 unittest 543 { 544 enum testSequence = "atgccaactactttgaacgcgCCGCAAGGCACAGGTGCGCCT"; 545 enum falseSequence = "atgccaactactttgaNNNNNnnnnnAGGCACAGGTGCGCCT"; 546 547 assert(canConvert(testSequence)); 548 assert(!canConvert(falseSequence)); 549 } 550 551 static CompressedBase convert(C)(in C fastaBase) if (isSomeChar!C) 552 { 553 switch (fastaBase.toLower) 554 { 555 case 'a': 556 return CompressedBase.a; 557 case 'c': 558 return CompressedBase.c; 559 case 't': 560 return CompressedBase.t; 561 case 'g': 562 return CompressedBase.g; 563 default: 564 throw new Exception("invalid base: " ~ fastaBase.to!string); 565 } 566 } 567 568 unittest 569 { 570 enum bases = "actgACTG"; 571 572 static foreach (i, base; bases) 573 { 574 mixin("assert(convert(base) == CompressedBase." ~ base.toLower.to!char ~ ");"); 575 } 576 } 577 578 static C convert(C)(in CompressedBase compressedBase) 579 { 580 final switch (compressedBase) 581 { 582 case CompressedBase.a: 583 return 'a'; 584 case CompressedBase.c: 585 return 'c'; 586 case CompressedBase.t: 587 return 't'; 588 case CompressedBase.g: 589 return 'g'; 590 } 591 } 592 593 unittest 594 { 595 enum bases = "actgACTG"; 596 597 static foreach (i, base; bases) 598 { 599 mixin("assert(convert!dchar(CompressedBase." ~ base.toLower.to!char ~ ") == base.toLower);"); 600 } 601 } 602 }; 603