// Constructs a Digester over a raw character buffer `seq` of `len` characters,
// using k-mers of size `k` and beginning at index `start` (default 0).
// NOTE(review): this excerpt does not show the full parameter list; minimized_h
// is used below but its declaration is not visible here.
89 Digester(
const char *seq,
size_t len,
unsigned k,
size_t start = 0,
// Member initializers: offset and the three hash values begin at 0, and `end`
// is set to start + k.
91 : seq(seq), len(len), offset(0), start(start), end(start + k), chash(0),
92 fhash(0), rhash(0), k(k), minimized_h(minimized_h) {
// Argument validation: the branch is taken when k < 4, when start is not a
// valid index into the sequence, or when minimized_h cast to int exceeds 2.
// The action taken inside the branch is not visible in this excerpt.
93 if (k < 4 or start >= len or (
int) minimized_h > 2) {
// Convenience constructor for std::string input: forwards seq.c_str() and
// seq.size(), together with k, start and minimized_h, to the
// (const char *, size_t, ...) constructor. The body itself is empty.
// NOTE(review): the declaration of minimized_h is not visible in this excerpt.
109 Digester(
const std::string &seq,
unsigned k,
size_t start = 0,
111 :
Digester(seq.c_str(), seq.size(), k, start, minimized_h) {}
140 return roll_one_skip_over();
142 return roll_one_write_over();
155 std::vector<uint32_t> &vec) = 0;
167 std::vector<std::pair<uint32_t, uint32_t>> &vec) = 0;
177 size_t get_pos() {
return offset + start - c_outs.size(); }
// Virtual new_seq overload taking a raw buffer `seq`, its length `len`, and a
// starting index `start`.
// Only part of the body is visible in this excerpt: `end` is reset to
// start + k and the current hash is marked invalid.
209 virtual void new_seq(
const char *seq,
size_t len,
size_t start) {
// The window end is placed k characters after the new start.
214 this->end = start + this->k;
// No hash has been computed for the new sequence yet.
215 is_valid_hash =
false;
// Virtual new_seq overload for std::string input: forwards seq.c_str(),
// seq.size() and `pos` to the three-argument new_seq overload.
232 virtual void new_seq(
const std::string &seq,
size_t pos) {
233 new_seq(seq.c_str(), seq.size(), pos);
255 append_seq_skip_over(seq, len);
257 append_seq_write_over(seq, len);
279 append_seq_skip_over(seq.c_str(), seq.size());
281 append_seq_write_over(seq.c_str(), seq.size());
// Lookup table indexed by the unsigned value of a character. An entry is 1
// only for 'A', 'C', 'G', 'T' and their lowercase forms; every other byte
// value maps to 0.
std::array<bool, 256> actg{
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00 - 0x0F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10 - 0x1F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
    0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F: 'A', 'C', 'G'
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F: 'T'
    0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F: 'a', 'c', 'g'
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F: 't'
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
};

/**
 * Tells whether a character is one of A, C, G, T (upper or lower case).
 *
 * @param in character to classify
 * @return true when the actg table marks the character, false otherwise
 */
bool is_ACTG(char in) {
    // Plain char may be signed: a byte >= 0x80 would otherwise turn into a
    // negative index, which converts to a huge size_t and reads outside the
    // 256-entry table. Converting to unsigned char keeps the index in 0..255.
    return actg[static_cast<unsigned char>(in)];
}
341 return init_hash_skip_over();
343 return init_hash_write_over();
// Appends a new buffer (`seq`, `len`) to the sequence being digested, for the
// mode whose helpers are suffixed "skip_over".
// NOTE(review): several lines of this body are not visible in this excerpt;
// the comments below describe only the statements that are shown.
347 void append_seq_skip_over(
const char *seq,
size_t len) {
// Appending is refused while `end` is still short of the current sequence length.
348 if (end < this->len) {
349 throw NotRolledTillEndException();
// Index of the last character of the current (old) sequence.
352 size_t ind = this->len - 1;
369 if ((start != end || c_outs.size() == k) && c_outs.size() > 0) {
// Characters are gathered from the old sequence via `ind` until, together
// with c_outs, k - 1 characters are available or `ind` falls below `start`.
// NOTE(review): `ind` is unsigned, so `ind >= start` cannot become false when
// start is 0 — confirm the hidden lines guard against wrap-around.
375 std::vector<char> temp_vec;
376 while (temp_vec.size() + c_outs.size() < k - 1 && ind >= start) {
377 if (!is_ACTG(this->seq[ind]))
380 temp_vec.push_back(this->seq[ind]);
// temp_vec is replayed back-to-front into c_outs.
386 for (std::vector<char>::reverse_iterator rit = temp_vec.rbegin();
387 rit != temp_vec.rend(); rit++) {
388 c_outs.push_back(*rit);
// c_outs is then topped up from the new buffer until it holds k characters
// or the new buffer is exhausted; the handling of non-ACTG characters is on
// lines not shown here.
397 while (c_outs.size() < k && ind < len) {
398 if (!is_ACTG(seq[ind])) {
407 c_outs.push_back(seq[ind]);
// With exactly k buffered characters, the forward, reverse and canonical
// hashes are computed from them and the hash is marked valid.
414 if (c_outs.size() == k) {
415 std::string temp(c_outs.begin(), c_outs.end());
418 fhash = base_forward_hash(temp.c_str(), k);
419 rhash = base_reverse_hash(temp.c_str(), k);
420 chash = nthash::canonical(fhash, rhash);
421 is_valid_hash =
true;
// Appends a new buffer (`seq`, `len`) to the sequence being digested, for the
// mode whose helpers are suffixed "write_over": in the visible statements a
// character that fails is_ACTG is buffered as 'A'.
// NOTE(review): several lines of this body are not visible in this excerpt;
// the comments below describe only the statements that are shown.
427 void append_seq_write_over(
const char *seq,
size_t len) {
// Appending is refused while `end` is still short of the current sequence length.
428 if (end < this->len) {
429 throw NotRolledTillEndException();
// Index of the last character of the current (old) sequence.
432 size_t ind = this->len - 1;
434 if ((start != end || c_outs.size() == k) && c_outs.size() > 0) {
// Characters are gathered from the old sequence via `ind` until, together
// with c_outs, k - 1 characters are available or `ind` falls below `start`.
// NOTE(review): `ind` is unsigned, so `ind >= start` cannot become false when
// start is 0 — confirm the hidden lines guard against wrap-around.
440 std::vector<char> temp_vec;
441 while (temp_vec.size() + c_outs.size() < k - 1 && ind >= start) {
// A non-ACTG character is recorded as 'A'; the push of the original
// character is presumably the else branch (the `else` is not visible here).
442 if (!is_ACTG(this->seq[ind])) {
443 temp_vec.push_back(
'A');
445 temp_vec.push_back(this->seq[ind]);
// temp_vec is replayed back-to-front into c_outs.
452 for (std::vector<char>::reverse_iterator rit = temp_vec.rbegin();
453 rit != temp_vec.rend(); rit++) {
454 c_outs.push_back(*rit);
// c_outs is then topped up from the new buffer until it holds k characters
// or the new buffer is exhausted, again substituting 'A' for non-ACTG input.
463 while (c_outs.size() < k && ind < len) {
464 if (!is_ACTG(seq[ind])) {
465 c_outs.push_back(
'A');
467 c_outs.push_back(seq[ind]);
// With exactly k buffered characters, the forward, reverse and canonical
// hashes are computed from them and the hash is marked valid.
476 if (c_outs.size() == k) {
477 std::string temp(c_outs.begin(), c_outs.end());
480 fhash = base_forward_hash(temp.c_str(), k);
481 rhash = base_reverse_hash(temp.c_str(), k);
482 chash = nthash::canonical(fhash, rhash);
483 is_valid_hash =
true;
// Computes the initial hash for the window [start, end) in the "skip_over"
// mode.
// NOTE(review): the return statements and the handling of a non-ACTG
// character are on lines not visible in this excerpt.
489 bool init_hash_skip_over() {
// Loop while the last index of the window (end - 1) is inside the sequence.
491 while (end - 1 < len) {
// Scan the window for any character rejected by is_ACTG.
493 for (
size_t i = start; i < end; i++) {
494 if (!is_ACTG(seq[i])) {
// Hash the k characters beginning at seq + start and mark the hash valid.
505 fhash = base_forward_hash(seq + start, k);
506 rhash = base_reverse_hash(seq + start, k);
507 chash = nthash::canonical(fhash, rhash);
508 is_valid_hash =
true;
// The hash is marked invalid here; presumably this is reached when the loop
// ends without hashing a window — confirm against the hidden lines.
511 is_valid_hash =
false;
// Computes the initial hash for the window [start, end) in the "write_over"
// mode: the window is copied into a temporary string in which a character
// rejected by is_ACTG is written as 'A', and that string is hashed.
// NOTE(review): the return statements and the `else` keyword are on lines not
// visible in this excerpt.
517 bool init_hash_write_over() {
// Loop while the last index of the window (end - 1) is inside the sequence.
519 while (end - 1 < len) {
520 std::string init_str;
521 for (
size_t i = start; i < end; i++) {
// Non-ACTG characters are replaced by 'A'; other characters are presumably
// copied unchanged in the else branch.
522 if (!is_ACTG(seq[i])) {
523 init_str.push_back(
'A');
525 init_str.push_back(seq[i]);
// Hash the substituted copy rather than the original buffer.
530 fhash = base_forward_hash(init_str.c_str(), k);
531 rhash = base_reverse_hash(init_str.c_str(), k);
532 chash = nthash::canonical(fhash, rhash);
533 is_valid_hash =
true;
// The hash is marked invalid here; presumably this is reached when the loop
// ends without hashing a window — confirm against the hidden lines.
536 is_valid_hash =
false;
// Advances the hash by one character in the "skip_over" mode.
// NOTE(review): many lines of this body (returns, index updates, bounds
// checks on `end`, and the non-ACTG handling) are not visible in this excerpt.
540 bool roll_one_skip_over() {
// Branch taken when no valid hash exists; its body is not visible here.
541 if (!is_valid_hash) {
545 is_valid_hash =
false;
// While c_outs holds characters, the outgoing character is c_outs.front();
// the incoming character is seq[end], used only when it passes is_ACTG.
548 if (c_outs.size() > 0) {
549 if (is_ACTG(seq[end])) {
550 fhash = next_forward_hash(fhash, k, c_outs.front(), seq[end]);
551 rhash = next_reverse_hash(rhash, k, c_outs.front(), seq[end]);
554 chash = nthash::canonical(fhash, rhash);
// In the other visible roll, the outgoing character is seq[start].
566 if (is_ACTG(seq[end])) {
567 fhash = next_forward_hash(fhash, k, seq[start], seq[end]);
568 rhash = next_reverse_hash(rhash, k, seq[start], seq[end]);
571 chash = nthash::canonical(fhash, rhash);
// Advances the hash by one character in the "write_over" mode: characters
// rejected by is_ACTG are treated as 'A' for both the incoming and the
// outgoing position.
// NOTE(review): returns, index updates and bounds checks on `end` are on
// lines not visible in this excerpt.
581 bool roll_one_write_over() {
// Branch taken when no valid hash exists; its body is not visible here.
582 if (!is_valid_hash) {
586 is_valid_hash =
false;
// Incoming character, with 'A' substituted for a non-ACTG character.
589 char next_char = is_ACTG(seq[end]) ? seq[end] :
'A';
// While c_outs holds characters, the outgoing character is c_outs.front().
590 if (c_outs.size() > 0) {
591 fhash = next_forward_hash(fhash, k, c_outs.front(), next_char);
592 rhash = next_reverse_hash(rhash, k, c_outs.front(), next_char);
// Otherwise the outgoing character is seq[start], again with 'A' substituted
// for a non-ACTG character.
597 char out_char = is_ACTG(seq[start]) ? seq[start] :
'A';
598 fhash = next_forward_hash(fhash, k, out_char, next_char);
599 rhash = next_reverse_hash(rhash, k, out_char, next_char);
603 chash = nthash::canonical(fhash, rhash);
// Buffer of characters filled by the append_seq_* functions; while it is
// non-empty the roll_one_* functions use its front element as the outgoing
// character, and get_pos subtracts its size from the reported position.
638 std::deque<char> c_outs;
// True only while fhash/rhash/chash describe a hashed k-mer; starts out false.
645 bool is_valid_hash =
false;