digest
Loading...
Searching...
No Matches
digester.hpp
1#ifndef DIGESTER_HPP
2#define DIGESTER_HPP
3
4#include <cstdint>
5#include <deque>
6#include <nthash/kmer.hpp>
7#include <nthash/nthash.hpp>
8
12namespace digest {
13
20class BadConstructionException : public std::exception {
21 const char *what() const throw() {
22 return "k must be greater than 3, start must be less than len";
23 }
24};
25
31class NotRolledTillEndException : public std::exception {
32 const char *what() const throw() {
33 return "Iterator must be at the end of the current sequence before "
34 "appending a new one.";
35 }
36};
37
43 CANON,
45 FORWARD,
48};
49
53enum class BadCharPolicy {
66};
67
75template <BadCharPolicy P> class Digester {
76 public:
89 Digester(const char *seq, size_t len, unsigned k, size_t start = 0,
91 : seq(seq), len(len), offset(0), start(start), end(start + k), chash(0),
92 fhash(0), rhash(0), k(k), minimized_h(minimized_h) {
93 if (k < 4 or start >= len or (int) minimized_h > 2) {
95 }
96 init_hash();
97 }
98
109 Digester(const std::string &seq, unsigned k, size_t start = 0,
111 : Digester(seq.c_str(), seq.size(), k, start, minimized_h) {}
112
113 virtual ~Digester() = default;
114
120 bool get_is_valid_hash() { return is_valid_hash; }
121
125 unsigned get_k() { return k; }
126
130 size_t get_len() { return len; }
131
138 bool roll_one() {
139 if (P == BadCharPolicy::SKIPOVER) {
140 return roll_one_skip_over();
141 } else {
142 return roll_one_write_over();
143 }
144 };
145
154 virtual void roll_minimizer(unsigned amount,
155 std::vector<uint32_t> &vec) = 0;
156
165 virtual void
166 roll_minimizer(unsigned amount,
167 std::vector<std::pair<uint32_t, uint32_t>> &vec) = 0;
168
177 size_t get_pos() { return offset + start - c_outs.size(); }
178
184 uint64_t get_chash() { return chash; }
185
190 uint64_t get_fhash() { return fhash; }
191
196 uint64_t get_rhash() { return rhash; }
197
209 virtual void new_seq(const char *seq, size_t len, size_t start) {
210 this->seq = seq;
211 this->len = len;
212 this->offset = 0;
213 this->start = start;
214 this->end = start + this->k;
215 is_valid_hash = false;
216 if (start >= len) {
218 }
219 init_hash();
220 }
221
232 virtual void new_seq(const std::string &seq, size_t pos) {
233 new_seq(seq.c_str(), seq.size(), pos);
234 }
235
253 void append_seq(const char *seq, size_t len) {
254 if (P == BadCharPolicy::SKIPOVER) {
255 append_seq_skip_over(seq, len);
256 } else {
257 append_seq_write_over(seq, len);
258 }
259 }
260
277 void append_seq(const std::string &seq) {
278 if (P == BadCharPolicy::SKIPOVER) {
279 append_seq_skip_over(seq.c_str(), seq.size());
280 } else {
281 append_seq_write_over(seq.c_str(), seq.size());
282 }
283 }
284
289 MinimizedHashType get_minimized_h() { return minimized_h; }
290
294 const char *get_sequence() { return seq; }
295
296 protected:
297 // 0x41 = 'A', 0x43 = 'C', 0x47 = 'G' 0x54 = 'T'
298 // 0x61 = 'a', 0x63 = 'c', 0x67 = 'g' 0x74 = 't'
299 std::array<bool, 256> actg{
300 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
301 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
302 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
303 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
304 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
305 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
306 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
307 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
308 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
309 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
310 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
311 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
312 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
313 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
314 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
315 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // NOLINT
316 };
317
327 bool is_ACTG(char in) { return actg[in]; }
328
339 bool init_hash() {
340 if (P == BadCharPolicy::SKIPOVER) {
341 return init_hash_skip_over();
342 } else {
343 return init_hash_write_over();
344 }
345 };
346
347 void append_seq_skip_over(const char *seq, size_t len) {
348 if (end < this->len) {
349 throw NotRolledTillEndException();
350 }
351 offset += this->len;
352 size_t ind = this->len - 1;
353
354 /*
355 this is for the case where we call append_seq after having
356 previously called append_seq and not having gotten through the deque
357 In such a case, since append_seq initializes a hash, we need to get
358 rid of the first character in the deque since if we just initialized
359 the hash without doing this, it would be identical the the current
360 hash held by the object
361
362 However, there is also the case that a hash was never previously
363 initialized, such as when the length of the string used in the
364 previous append_seq call, plus the the amount of ACTG characters
365 after the last non-ACTG character in the original string summed to be
366 less than k In this case, it would not be correct to remove the first
367 character in the deque
368 */
369 if ((start != end || c_outs.size() == k) && c_outs.size() > 0) {
370 c_outs.pop_front();
371 }
372
373 // the following copies in characters from the end of the old sequence
374 // into the deque
375 std::vector<char> temp_vec;
376 while (temp_vec.size() + c_outs.size() < k - 1 && ind >= start) {
377 if (!is_ACTG(this->seq[ind]))
378 break;
379
380 temp_vec.push_back(this->seq[ind]);
381 if (ind == 0)
382 break;
383
384 ind--;
385 }
386 for (std::vector<char>::reverse_iterator rit = temp_vec.rbegin();
387 rit != temp_vec.rend(); rit++) {
388 c_outs.push_back(*rit);
389 }
390
391 // the following copies in characters from the front of the new sequence
392 // if there weren't enough non-ACTG characters at the end of the old
393 // sequence
394 ind = 0;
395 start = 0;
396 end = 0;
397 while (c_outs.size() < k && ind < len) {
398 if (!is_ACTG(seq[ind])) {
399 start = ind + 1;
400 end = start + k;
401 this->seq = seq;
402 this->len = len;
403 c_outs.clear();
404 init_hash();
405 break;
406 }
407 c_outs.push_back(seq[ind]);
408 ind++;
409 start++;
410 end++;
411 }
412
413 // the following initializes a hash if we managed to fill the deque
414 if (c_outs.size() == k) {
415 std::string temp(c_outs.begin(), c_outs.end());
416 // nthash::ntc64(temp.c_str(), k, fhash, rhash, chash,
417 // locn_useless);
418 fhash = base_forward_hash(temp.c_str(), k);
419 rhash = base_reverse_hash(temp.c_str(), k);
420 chash = nthash::canonical(fhash, rhash);
421 is_valid_hash = true;
422 }
423 this->seq = seq;
424 this->len = len;
425 }
426
427 void append_seq_write_over(const char *seq, size_t len) {
428 if (end < this->len) {
429 throw NotRolledTillEndException();
430 }
431 offset += this->len;
432 size_t ind = this->len - 1;
433
434 if ((start != end || c_outs.size() == k) && c_outs.size() > 0) {
435 c_outs.pop_front();
436 }
437
438 // the following copies in characters from the end of the old sequence
439 // into the deque
440 std::vector<char> temp_vec;
441 while (temp_vec.size() + c_outs.size() < k - 1 && ind >= start) {
442 if (!is_ACTG(this->seq[ind])) {
443 temp_vec.push_back('A');
444 } else {
445 temp_vec.push_back(this->seq[ind]);
446 }
447 if (ind == 0)
448 break;
449
450 ind--;
451 }
452 for (std::vector<char>::reverse_iterator rit = temp_vec.rbegin();
453 rit != temp_vec.rend(); rit++) {
454 c_outs.push_back(*rit);
455 }
456
457 // the following copies in characters from the front of the new sequence
458 // if there weren't enough non-ACTG characters at the end of the old
459 // sequence
460 ind = 0;
461 start = 0;
462 end = 0;
463 while (c_outs.size() < k && ind < len) {
464 if (!is_ACTG(seq[ind])) {
465 c_outs.push_back('A');
466 } else {
467 c_outs.push_back(seq[ind]);
468 }
469
470 ind++;
471 start++;
472 end++;
473 }
474
475 // the following initializes a hash if we managed to fill the deque
476 if (c_outs.size() == k) {
477 std::string temp(c_outs.begin(), c_outs.end());
478 // nthash::ntc64(temp.c_str(), k, fhash, rhash, chash,
479 // locn_useless);
480 fhash = base_forward_hash(temp.c_str(), k);
481 rhash = base_reverse_hash(temp.c_str(), k);
482 chash = nthash::canonical(fhash, rhash);
483 is_valid_hash = true;
484 }
485 this->seq = seq;
486 this->len = len;
487 }
488
489 bool init_hash_skip_over() {
490 c_outs.clear();
491 while (end - 1 < len) {
492 bool works = true;
493 for (size_t i = start; i < end; i++) {
494 if (!is_ACTG(seq[i])) {
495 start = i + 1;
496 end = start + k;
497 works = false;
498 break;
499 }
500 }
501 if (!works) {
502 continue;
503 }
504 // nthash::ntc64(seq + start, k, fhash, rhash, chash, locn_useless);
505 fhash = base_forward_hash(seq + start, k);
506 rhash = base_reverse_hash(seq + start, k);
507 chash = nthash::canonical(fhash, rhash);
508 is_valid_hash = true;
509 return true;
510 }
511 is_valid_hash = false;
512 return false;
513 }
514
515 // need to do a good bit of rewriting
516 // not performance critical so it's kinda whatever
517 bool init_hash_write_over() {
518 c_outs.clear();
519 while (end - 1 < len) {
520 std::string init_str;
521 for (size_t i = start; i < end; i++) {
522 if (!is_ACTG(seq[i])) {
523 init_str.push_back('A');
524 } else {
525 init_str.push_back(seq[i]);
526 }
527 }
528
529 // nthash::ntc64(seq + start, k, fhash, rhash, chash, locn_useless);
530 fhash = base_forward_hash(init_str.c_str(), k);
531 rhash = base_reverse_hash(init_str.c_str(), k);
532 chash = nthash::canonical(fhash, rhash);
533 is_valid_hash = true;
534 return true;
535 }
536 is_valid_hash = false;
537 return false;
538 }
539
540 bool roll_one_skip_over() {
541 if (!is_valid_hash) {
542 return false;
543 }
544 if (end >= len) {
545 is_valid_hash = false;
546 return false;
547 }
548 if (c_outs.size() > 0) {
549 if (is_ACTG(seq[end])) {
550 fhash = next_forward_hash(fhash, k, c_outs.front(), seq[end]);
551 rhash = next_reverse_hash(rhash, k, c_outs.front(), seq[end]);
552 c_outs.pop_front();
553 end++;
554 chash = nthash::canonical(fhash, rhash);
555 return true;
556 } else {
557 // c_outs will contain at most k-1 characters, so if we jump to
558 // end + 1, we won't consider anything else in deque so we
559 // should clear it
560 c_outs.clear();
561 start = end + 1;
562 end = start + k;
563 return init_hash();
564 }
565 } else {
566 if (is_ACTG(seq[end])) {
567 fhash = next_forward_hash(fhash, k, seq[start], seq[end]);
568 rhash = next_reverse_hash(rhash, k, seq[start], seq[end]);
569 start++;
570 end++;
571 chash = nthash::canonical(fhash, rhash);
572 return true;
573 } else {
574 start = end + 1;
575 end = start + k;
576 return init_hash();
577 }
578 }
579 }
580
581 bool roll_one_write_over() {
582 if (!is_valid_hash) {
583 return false;
584 }
585 if (end >= len) {
586 is_valid_hash = false;
587 return false;
588 }
589 char next_char = is_ACTG(seq[end]) ? seq[end] : 'A';
590 if (c_outs.size() > 0) {
591 fhash = next_forward_hash(fhash, k, c_outs.front(), next_char);
592 rhash = next_reverse_hash(rhash, k, c_outs.front(), next_char);
593 c_outs.pop_front();
594 end++;
595
596 } else {
597 char out_char = is_ACTG(seq[start]) ? seq[start] : 'A';
598 fhash = next_forward_hash(fhash, k, out_char, next_char);
599 rhash = next_reverse_hash(rhash, k, out_char, next_char);
600 start++;
601 end++;
602 }
603 chash = nthash::canonical(fhash, rhash);
604 return true;
605 }
606
607 // sequence to be digested, memory is owned by the user
608 const char *seq;
609
610 // length of seq
611 size_t len;
612
613 // the combined length of all the previous strings that have been appended
614 // together, not counting the current string
615 size_t offset;
616
617 // internal index of the next character to be thrown out, junk if c_outs is
618 // not empty
619 size_t start;
620
621 // internal index of next character to be added
622 size_t end;
623
624 // canonical hash of current k-mer
625 uint64_t chash;
626
627 // forward hash of current k-mer
628 uint64_t fhash;
629
630 // reverse hash of current k-mer
631 uint64_t rhash;
632
633 // length of kmer
634 unsigned k;
635
636 // deque of characters to be rolled out in the rolling hash from left to
637 // right
638 std::deque<char> c_outs;
639
640 // Hash value to be minimized, 0 for canonical, 1 for forward, 2 for reverse
641 MinimizedHashType minimized_h;
642
643 // bool representing whether the current hash is meaningful, i.e.
644 // corresponds to the k-mer at get_pos()
645 bool is_valid_hash = false;
646};
647
648} // namespace digest
649
650#endif // DIGESTER_HPP
Exception thrown when initializing a Digister with k (kmer size) < 4 and start (starting index) < len...
Definition digester.hpp:20
an abstract class for Digester objects.
Definition digester.hpp:75
virtual void new_seq(const std::string &seq, size_t pos)
replaces the current sequence with the new one. It's like starting over with a completely new sequenc...
Definition digester.hpp:232
virtual void roll_minimizer(unsigned amount, std::vector< uint32_t > &vec)=0
gets the positions, as defined by get_pos(), of minimizers up to the amount specified
void append_seq(const char *seq, size_t len)
simulates the appending of a new sequence to the end of the old sequence. The old sequence will no lo...
Definition digester.hpp:253
bool roll_one()
moves the internal pointer to the next valid k-mer. Time Complexity: O(1)
Definition digester.hpp:138
Digester(const std::string &seq, unsigned k, size_t start=0, MinimizedHashType minimized_h=MinimizedHashType::CANON)
Definition digester.hpp:109
Digester(const char *seq, size_t len, unsigned k, size_t start=0, MinimizedHashType minimized_h=MinimizedHashType::CANON)
Definition digester.hpp:89
bool get_is_valid_hash()
Definition digester.hpp:120
MinimizedHashType get_minimized_h()
Definition digester.hpp:289
uint64_t get_rhash()
Definition digester.hpp:196
uint64_t get_chash()
Definition digester.hpp:184
unsigned get_k()
Definition digester.hpp:125
virtual void roll_minimizer(unsigned amount, std::vector< std::pair< uint32_t, uint32_t > > &vec)=0
gets the positions (pair.first), as defined by get_pos(), and the hashes (pair.second) of minimizers ...
void append_seq(const std::string &seq)
simulates the appending of a new sequence to the end of the old sequence. The old sequence will no lo...
Definition digester.hpp:277
size_t get_pos()
Definition digester.hpp:177
const char * get_sequence()
Definition digester.hpp:294
size_t get_len()
Definition digester.hpp:130
uint64_t get_fhash()
Definition digester.hpp:190
virtual void new_seq(const char *seq, size_t len, size_t start)
replaces the current sequence with the new one. It's like starting over with a completely new seqeunc...
Definition digester.hpp:209
Exception thrown when append_seq() is called before all kmers/large windows in the current sequence h...
Definition digester.hpp:31
digest code.
Definition data_structure.hpp:27
BadCharPolicy
Specifies behavior with non-ACTG characters.
Definition digester.hpp:53
MinimizedHashType
Enum values for the type of hash to minimize.
Definition digester.hpp:41