diff --git a/README.md b/README.md index 2ddee5e..da93784 100644 --- a/README.md +++ b/README.md @@ -25,9 +25,9 @@ With `-o` option, you can specify the prefix of the output files. It is `./yash. With `-a` option, you can specify a AGP format file to ask YaHS to do scaffolding with the scaffolds in the AGP file as the start point. -With `-r` option, you can specify a range of resultions. It is `50000,100000,200000,500000,1000000,2000000,5000000,10000000,20000000` by default and the upper limit is automatically adjusted with the genome size. +With `-r` option, you can specify a range of resultions (in ascending order). It is `10000,20000,50000,100000,200000,500000,1000000,2000000,5000000,10000000,20000000,50000000,100000000,200000000,500000000` by default and the upper limit is automatically adjusted by the genome size. For highly fragmented genome assemblies, you can try to start with higher resultions by adding smaller `-r` values. -With `-e` option, you can specify the restriction enzyme(s) used by the Hi-C experiment. For example, `GATC` for the DpnII restriction enzyme used by the Dovetail Hi-C Kit; `GATC,GANT` and `CGATC,GANTC,CTNAG,TTAA` for Arima genomics 2-enzyme and 4-enzyme protocol, respectively. Sometimes, the specification of enzymes may not change the scaffolding result very much if not make it worse, especically when the base quality of the assembly is not very good, e.g., assembies constructed from noisy long reads. +With `-e` option, you can specify the restriction enzyme(s) used by the Hi-C experiment. For example, `GATC` for the DpnII restriction enzyme used by the Dovetail Hi-C Kit; `GATC,GANT` and `CGATC,GANTC,CTNAG,TTAA` for Arima genomics 2-enzyme and 4-enzyme protocol, respectively. Sometimes, the specification of enzymes may not change the scaffolding result very much if not make it worse, especially when the base quality of the assembly is not very good, e.g., assembies constructed from noisy long reads. With `-l` option, you can specify the minimum contig length included for scaffolding. diff --git a/asset.c b/asset.c index e1bb9fa..301495d 100644 --- a/asset.c +++ b/asset.c @@ -194,3 +194,19 @@ uint64_t linear_scale(uint64_t g, int *scale, uint64_t max_g) return g; } +void write_bin_header(FILE *fo) +{ + int64_t magic_number = BIN_H; + int64_t bin_version = BIN_V; + magic_number |= bin_version; + fwrite(&magic_number, sizeof(int64_t), 1, fo); +} + +int is_valid_bin_header(int64_t n) +{ + int64_t magic_number = BIN_H; + int64_t bin_version = BIN_V; + magic_number |= bin_version; + return n == magic_number; +} + diff --git a/asset.h b/asset.h index 53ef69e..8bb02d3 100644 --- a/asset.h +++ b/asset.h @@ -37,6 +37,9 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define BUFF_SIZE 4096 #define GAP_SZ 200 +#define BIN_H 0x5941485342494E56 +#define BIN_V 0x1 +#define LINK_EVIDENCE "proximity_ligation" #ifdef __cplusplus extern "C" { @@ -51,6 +54,8 @@ int file_copy(char *fin, char *fout); int8_t is_read_pair(const char *rname0, const char *rname1); uint32_t div_ceil(uint64_t x, uint32_t y); uint64_t linear_scale(uint64_t g, int *scale, uint64_t max_g); +void write_bin_header(FILE *fo); +int is_valid_bin_header(int64_t magic_number); #ifdef __cplusplus } #endif diff --git a/break.c b/break.c index f727e52..09b8a97 100644 --- a/break.c +++ b/break.c @@ -78,15 +78,14 @@ void link_mat_destroy(link_mat_t *link_mat) free(link_mat); } -uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double min_frac, uint32_t resolution) +uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double min_frac, uint32_t resolution, uint8_t mq) { - uint32_t i; FILE *fp; + uint32_t i, m, i0, i1, nb, *link_c; + uint8_t buffer[BUFF_SIZE * 17]; + uint64_t max_len, p0, p1; + int64_t magic_number; long pair_c, intra_c, cum_c; - uint64_t max_len; - uint32_t nb, *link_c; - uint32_t buffer[BUFF_SIZE], m, i0, i1; - uint64_t p0, p1; max_len = 0; for (i = 0; i < dict->n; ++i) @@ -101,21 +100,33 @@ uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double m exit(EXIT_FAILURE); } + m = fread(&magic_number, sizeof(int64_t), 1, fp); + if (!m || !is_valid_bin_header(magic_number)) { + fprintf(stderr, "[E::%s] not a valid BIN file\n", __func__); + exit(EXIT_FAILURE); + } + pair_c = 0; intra_c = 0; while (1) { - m = fread(&buffer, sizeof(uint32_t), BUFF_SIZE, fp); - for (i = 0; i < m; i += 4) { - sd_coordinate_conversion(dict, buffer[i], buffer[i + 1], &i0, &p0, 0); - sd_coordinate_conversion(dict, buffer[i + 2], buffer[i + 3], &i1, &p1, 0); + m = fread(buffer, sizeof(uint8_t), BUFF_SIZE * 17, fp); + + for (i = 0; i < m; i += 17) { + if (*(uint8_t *) (buffer + i + 16) < mq) + continue; + + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i), *(uint32_t *) (buffer + i + 4), &i0, &p0, 0); + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i + 8), *(uint32_t *) (buffer + i + 12), &i1, &p1, 0); + if (i0 == i1) { ++link_c[labs((long) p0 - p1) / resolution]; ++intra_c; } + + ++pair_c; } - pair_c += m / 4; - if (m < BUFF_SIZE) { + if (m < BUFF_SIZE * 17) { if (ferror(fp)) return 0; break; @@ -130,7 +141,7 @@ uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double m cum_c += link_c[i++]; free(link_c); -#ifdef REMOVE_NOISE +#ifdef DEBUG printf("[I::%s] %ld read pairs processed, intra links: %ld \n", __func__, pair_c, intra_c); #endif return i * resolution; @@ -176,12 +187,13 @@ static void calc_moving_average(int64_t *arr, int32_t n, int32_t a) free(buff); } -link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_thres, uint32_t resolution, double noise, uint32_t move_avg) +link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_thres, uint32_t resolution, double noise, uint32_t move_avg, uint8_t mq) { FILE *fp; - uint32_t i, j, n; - uint32_t buffer[BUFF_SIZE], m, i0, i1; + uint32_t i, j, m, n, i0, i1; uint64_t p0, p1; + int64_t magic_number; + uint8_t buffer[BUFF_SIZE * 17]; long pair_c, intra_c; fp = fopen(f, "r"); @@ -190,6 +202,12 @@ link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_th exit(EXIT_FAILURE); } + m = fread(&magic_number, sizeof(int64_t), 1, fp); + if (!m || !is_valid_bin_header(magic_number)) { + fprintf(stderr, "[E::%s] not a valid BIN file\n", __func__); + exit(EXIT_FAILURE); + } + link_mat_t *link_mat = (link_mat_t *) malloc(sizeof(link_mat_t)); link_mat->b = resolution; link_mat->n = dict->n; @@ -203,22 +221,27 @@ link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_th pair_c = intra_c = 0; while (1) { - m = fread(&buffer, sizeof(uint32_t), BUFF_SIZE, fp); - for (i = 0; i < m; i += 4) { - sd_coordinate_conversion(dict, buffer[i], buffer[i + 1], &i0, &p0, 0); - sd_coordinate_conversion(dict, buffer[i + 2], buffer[i + 3], &i1, &p1, 0); + m = fread(buffer, sizeof(uint8_t), BUFF_SIZE * 17, fp); + + for (i = 0; i < m; i += 17) { + if (*(uint8_t *) (buffer + i + 16) < mq) + continue; + + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i), *(uint32_t *) (buffer + i + 4), &i0, &p0, 0); + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i + 8), *(uint32_t *) (buffer + i + 12), &i1, &p1, 0); if (p0 > p1) SWAP(uint64_t, p0, p1); if (i0 == i1 && p1 - p0 <= dist_thres) { - ++intra_c; link_mat->link[i0].link[(MAX(p0, 1) - 1) / resolution] += 1; link_mat->link[i1].link[(MAX(p1, 1) - 1) / resolution] -= 1; + ++intra_c; } + + ++pair_c; } - pair_c += m / 4; - - if (m < BUFF_SIZE) { + + if (m < BUFF_SIZE * 17) { if (ferror(fp)) return 0; break; diff --git a/break.h b/break.h index faf5d3d..bfff100 100644 --- a/break.h +++ b/break.h @@ -60,8 +60,8 @@ extern "C" { link_t *link_init(uint32_t s, uint32_t n); link_mat_t *link_mat_init(asm_dict_t *dict, uint32_t b); -link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_thres, uint32_t resolution, double noise, uint32_t move_avg); -uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double min_frac, uint32_t resolution); +link_mat_t *link_mat_from_file(const char *f, asm_dict_t *dict, uint32_t dist_thres, uint32_t resolution, double noise, uint32_t move_avg, uint8_t mq); +uint32_t estimate_dist_thres_from_file(const char *f, asm_dict_t *dict, double min_frac, uint32_t resolution, uint8_t mq); void link_mat_destroy(link_mat_t *link_mat); void print_link_mat(link_mat_t *link_mat, asm_dict_t *dict, FILE *fp); bp_t *detect_break_points(link_mat_t *link_mat, uint32_t bin_size, uint32_t merge_size, double fold_thres, uint32_t dual_break_thres, uint32_t *bp_n); diff --git a/graph.c b/graph.c index 67e97bc..09028a4 100644 --- a/graph.c +++ b/graph.c @@ -800,7 +800,7 @@ int search_graph_path(graph_t *g, asm_dict_t *dict, char *out) fprintf(agp_out, "scaffold_%u\t%lu\t%lu\t%u\tW\t%s\t%u\t%u\t%c\n", s, len + 1, len + cseg.y, ++t, sd->s[cseg.c >> 1].name, cseg.x + 1, cseg.x + cseg.y, "+-"[(cseg.c & 1) ^ ori]); len += cseg.y; if (k != nseg - 1 || j != qs - 1) { - fprintf(agp_out, "scaffold_%u\t%lu\t%lu\t%u\tN\t%d\tscaffold\tyes\tna\n", s, len + 1, len + GAP_SZ, ++t, GAP_SZ); + fprintf(agp_out, "scaffold_%u\t%lu\t%lu\t%u\tN\t%d\tscaffold\tyes\t%s\n", s, len + 1, len + GAP_SZ, ++t, GAP_SZ, LINK_EVIDENCE); len += GAP_SZ; } } diff --git a/juicer.c b/juicer.c index 7394b92..2b9d6f8 100644 --- a/juicer.c +++ b/juicer.c @@ -42,12 +42,13 @@ static double jc_realtime0; KHASH_SET_INIT_STR(str) -static int make_juicer_pre_file_from_bin(char *f, char *agp, char *fai, int scale, int count_gap, FILE *fo) +static int make_juicer_pre_file_from_bin(char *f, char *agp, char *fai, uint8_t mq, int scale, int count_gap, FILE *fo) { FILE *fp; - uint32_t i, i0, i1; + uint32_t i, i0, i1, m; uint64_t p0, p1; - uint32_t buffer[BUFF_SIZE], m; + int64_t magic_number; + uint8_t buffer[BUFF_SIZE * 17]; long pair_c; sdict_t *sdict = make_sdict_from_index(fai, 0); @@ -58,13 +59,24 @@ static int make_juicer_pre_file_from_bin(char *f, char *agp, char *fai, int scal fprintf(stderr, "[E::%s] cannot open file %s for reading\n", __func__, f); exit(EXIT_FAILURE); } + + m = fread(&magic_number, sizeof(int64_t), 1, fp); + if (!m || !is_valid_bin_header(magic_number)) { + fprintf(stderr, "[E::%s] not a valid BIN file\n", __func__); + exit(EXIT_FAILURE); + } pair_c = 0; while (1) { - m = fread(&buffer, sizeof(uint32_t), BUFF_SIZE, fp); - for (i = 0; i < m; i += 4) { - sd_coordinate_conversion(dict, buffer[i], buffer[i + 1], &i0, &p0, count_gap); - sd_coordinate_conversion(dict, buffer[i + 2], buffer[i + 3], &i1, &p1, count_gap); + m = fread(buffer, sizeof(uint8_t), BUFF_SIZE * 17, fp); + + for (i = 0; i < m; i += 17) { + if (*(uint8_t *) (buffer + i + 16) < mq) + continue; + + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i), *(uint32_t *) (buffer + i + 4), &i0, &p0, count_gap); + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i + 8), *(uint32_t *) (buffer + i + 12), &i1, &p1, count_gap); + if (i0 == UINT32_MAX || i1 == UINT32_MAX) { fprintf(stderr, "[W::%s] sequence not found \n", __func__); } else { @@ -73,10 +85,11 @@ static int make_juicer_pre_file_from_bin(char *f, char *agp, char *fai, int scal else fprintf(fo, "0\t%s\t%lu\t1\t1\t%s\t%lu\t0\n", dict->s[i1].name, p1 >> scale, dict->s[i0].name, p0 >> scale); } + + ++pair_c; } - pair_c += m / 4; - if (m < BUFF_SIZE) { + if (m < BUFF_SIZE * 17) { if (ferror(fp)) return 1; break; @@ -100,6 +113,7 @@ static int make_juicer_pre_file_from_bed(char *f, char *agp, char *fai, uint8_t char cname0[4096], cname1[4096], rname0[4096], rname1[4096]; uint32_t s0, s1, e0, e1, i0, i1; uint64_t p0, p1; + uint8_t q0, q1; int8_t buff; long rec_c, pair_c; @@ -123,16 +137,24 @@ static int make_juicer_pre_file_from_bed(char *f, char *agp, char *fai, uint8_t rec_c = pair_c = 0; buff = 0; while ((read = getline(&line, &ln, fp)) != -1) { + + if (++rec_c % 1000000 == 0) + fprintf(stderr, "[I::%s] %ld million records processed, %ld read pairs \n", __func__, rec_c / 1000000, pair_c); + if (buff == 0) { - sscanf(line, "%s %u %u %s", cname0, &s0, &e0, rname0); + sscanf(line, "%s %u %u %s %hhu %*s", cname0, &s0, &e0, rname0, &q0); ++buff; } else if (buff == 1) { - sscanf(line, "%s %u %u %s", cname1, &s1, &e1, rname1); + sscanf(line, "%s %u %u %s %hhu %*s", cname1, &s1, &e1, rname1, &q1); if (is_read_pair(rname0, rname1)) { - ++pair_c; + buff = 0; + + if (q0 < mq || q1 < mq) + continue; sd_coordinate_conversion(dict, sd_get(sdict, cname0), s0 / 2 + e0 / 2 + (s0 & 1 && e0 & 1), &i0, &p0, count_gap); sd_coordinate_conversion(dict, sd_get(sdict, cname1), s1 / 2 + e1 / 2 + (s1 & 1 && e1 & 1), &i1, &p1, count_gap); + if (i0 == UINT32_MAX) { k = kh_put(str, hmseq, cname0, &absent); if (absent) { @@ -150,19 +172,19 @@ static int make_juicer_pre_file_from_bed(char *f, char *agp, char *fai, uint8_t fprintf(fo, "0\t%s\t%lu\t0\t1\t%s\t%lu\t1\n", dict->s[i0].name, p0 >> scale, dict->s[i1].name, p1 >> scale); else fprintf(fo, "0\t%s\t%lu\t1\t1\t%s\t%lu\t0\n", dict->s[i1].name, p1 >> scale, dict->s[i0].name, p0 >> scale); + + ++pair_c; } - buff = 0; } else { + buff = 1; + strcpy(cname0, cname1); s0 = s1; e0 = e1; + q0 = q1; strcpy(rname0, rname1); - buff = 1; } } - - if (++rec_c % 1000000 == 0) - fprintf(stderr, "[I::%s] %ld million records processed, %ld read pairs \n", __func__, rec_c / 1000000, pair_c); } fprintf(stderr, "[I::%s] %ld read pairs processed\n", __func__, pair_c); @@ -195,16 +217,16 @@ static int32_t get_target_end(uint32_t *cigar, int n_cigar, int32_t s) static char *parse_bam_rec(bam1_t *b, bam_header_t *h, uint8_t q, int32_t *s, int32_t *e, char **cname) { - // 0x4 0x100 0x400 0x800 - if (b->core.flag & 0xD04) + // 0x4 0x100 0x200 0x400 0x800 + if (b->core.flag & 0xF04) return 0; *cname = h->target_name[b->core.tid]; if (b->core.qual < q) { *s = -1; *e = -1; } else { - *s = b->core.pos + 1; - *e = get_target_end(bam1_cigar(b), b->core.n_cigar, b->core.pos) + 1; + *s = b->core.pos; + *e = get_target_end(bam1_cigar(b), b->core.n_cigar, b->core.pos); } return strdup(bam1_qname(b)); @@ -212,13 +234,13 @@ static char *parse_bam_rec(bam1_t *b, bam_header_t *h, uint8_t q, int32_t *s, in static char *parse_bam_rec1(bam1_t *b, bam_header_t *h, char **cname0, int32_t *s0, char **cname1, int32_t *s1) { - // 0x4 0x8 0x40 0x100 0x400 0x800 - if (b->core.flag & 0xD4C) + // 0x4 0x8 0x40 0x100 0x200 0x400 0x800 + if (b->core.flag & 0xF4C) return 0; *cname0 = h->target_name[b->core.tid]; - *s0 = b->core.pos + 1; + *s0 = b->core.pos; *cname1 = h->target_name[b->core.mtid]; - *s1 = b->core.mpos + 1; + *s1 = b->core.mpos; return bam1_qname(b); } @@ -263,6 +285,7 @@ static int make_juicer_pre_file_from_bam(char *f, char *agp, char *fai, uint8_t if (so == ORDER_NAME) { // sorted by read names while (bam_read1(fp, b) >= 0 ) { + if (++rec_c % 1000000 == 0) fprintf(stderr, "[I::%s] %ld million records processed, %ld read pairs \n", __func__, rec_c / 1000000, pair_c); @@ -276,9 +299,9 @@ static int make_juicer_pre_file_from_bam(char *f, char *agp, char *fai, uint8_t if (!rname1) continue; if (strcmp(rname0, rname1) == 0) { - ++pair_c; + buff = 0; - if (s0 > 0 && s1 >0) { + if (s0 >= 0 && s1 >= 0) { sd_coordinate_conversion(dict, sd_get(sdict, cname0), s0 / 2 + e0 / 2 + (s0 & 1 && e0 & 1), &i0, &p0, count_gap); sd_coordinate_conversion(dict, sd_get(sdict, cname1), s1 / 2 + e1 / 2 + (s1 & 1 && e1 & 1), &i1, &p1, count_gap); @@ -299,37 +322,40 @@ static int make_juicer_pre_file_from_bam(char *f, char *agp, char *fai, uint8_t fprintf(fo, "0\t%s\t%lu\t0\t1\t%s\t%lu\t1\n", dict->s[i0].name, p0 >> scale, dict->s[i1].name, p1 >> scale); else fprintf(fo, "0\t%s\t%lu\t1\t1\t%s\t%lu\t0\n", dict->s[i1].name, p1 >> scale, dict->s[i0].name, p0 >> scale); + + ++pair_c; } } + free(rname0); free(rname1); rname0 = 0; rname1 = 0; - buff = 0; } else { + buff = 1; + cname0 = cname1; s0 = s1; e0 = e1; free(rname0); rname0 = rname1; rname1 = 0; - buff = 1; } } } } else { // sorted by coordinates or others if (mq > 0) - fprintf(stderr, "[W::%s] BAM file is not sorted by read name. Filtering by mapping quality %hu suppressed \n", __func__, mq); + fprintf(stderr, "[W::%s] BAM file is not sorted by read name. Filtering by mapping quality %hhu suppressed \n", __func__, mq); while (bam_read1(fp, b) >= 0 ) { + if (++rec_c % 1000000 == 0) fprintf(stderr, "[I::%s] %ld million records processed, %ld read pairs \n", __func__, rec_c / 1000000, pair_c); if(!parse_bam_rec1(b, h, &cname0, &s0, &cname1, &s1)) continue; - ++pair_c; sd_coordinate_conversion(dict, sd_get(sdict, cname0), s0, &i0, &p0, count_gap); sd_coordinate_conversion(dict, sd_get(sdict, cname1), s1, &i1, &p1, count_gap); @@ -350,6 +376,8 @@ static int make_juicer_pre_file_from_bam(char *f, char *agp, char *fai, uint8_t fprintf(fo, "0\t%s\t%lu\t0\t1\t%s\t%lu\t1\n", dict->s[i0].name, p0 >> scale, dict->s[i1].name, p1 >> scale); else fprintf(fo, "0\t%s\t%lu\t1\t1\t%s\t%lu\t0\n", dict->s[i1].name, p1 >> scale, dict->s[i0].name, p0 >> scale); + + ++pair_c; } } } @@ -611,7 +639,7 @@ static int main_pre(int argc, char *argv[]) ret = make_juicer_pre_file_from_bed(link_file, agp1, fai, mq8, scale, !asm_mode, fo); } else if (strcmp(ext, ".bin") == 0) { fprintf(stderr, "[I::%s] make juicer pre input from BIN file %s\n", __func__, link_file); - ret = make_juicer_pre_file_from_bin(link_file, agp1, fai, scale, !asm_mode, fo); + ret = make_juicer_pre_file_from_bin(link_file, agp1, fai, mq8, scale, !asm_mode, fo); } else { fprintf(stderr, "[E::%s] unknown link file format. File extension .bam, .bed or .bin is expected\n", __func__); exit(EXIT_FAILURE); @@ -736,7 +764,7 @@ static int assembly_to_agp(char *assembly, char *lift, sdict_t *sdict, FILE *fo) fprintf(fo, "scaffold_%d\t%ld\t%ld\t%d\tW\t%s\t%d\t%d\t%c\n", sid, slen + 1, slen + coords[cid * 3 + 2], ++fid, sdict->s[coords[cid * 3]].name, coords[cid * 3 + 1], coords[cid * 3 + 1] + coords[cid * 3 + 2] - 1, "-+"[sign]); slen += coords[cid * 3 + 2]; while (*eptr != '\n') { - fprintf(fo, "scaffold_%d\t%ld\t%ld\t%d\tN\t%d\tscaffold\tyes\tna\n", sid, slen + 1, slen + GAP_SZ, ++fid, GAP_SZ); + fprintf(fo, "scaffold_%d\t%ld\t%ld\t%d\tN\t%d\tscaffold\tyes\t%s\n", sid, slen + 1, slen + GAP_SZ, ++fid, GAP_SZ, LINK_EVIDENCE); slen += GAP_SZ; cid = strtol(eptr + 1, &fptr, 10); diff --git a/link.c b/link.c index 3fddc3a..ec6934b 100644 --- a/link.c +++ b/link.c @@ -44,6 +44,7 @@ #undef DEBUG #undef DEBUG_NOISE #undef DEBUG_ORIEN +#undef DEBUG_INTRA #define USE_MEDIAN_NORM #define MIN_RE_DENS .1 @@ -341,7 +342,7 @@ intra_link_mat_t *intra_link_mat_init(asm_dict_t *dict, re_cuts_t *re_cuts, uint //for (j = 0; j < p; ++j) // if (link->link[j] < .5) // link->link[j] = -FLT_MAX; -#ifdef DEBUG +#ifdef DEBUG_INTRA printf("[I::%s] %s bins: %d\n", __func__, dict->s[i].name, link->n); #endif } @@ -495,11 +496,12 @@ static inline void normalise_by_size(double *l) *l = i; } -intra_link_mat_t *intra_link_mat_from_file(const char *f, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, int use_gap_seq) +intra_link_mat_t *intra_link_mat_from_file(const char *f, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, int use_gap_seq, uint8_t mq) { - uint32_t i, j, n, b0, b1; - uint32_t buffer[BUFF_SIZE], k, m, i0, i1; + uint32_t i, j, k, m, n, i0, i1, b0, b1; uint64_t p0, p1; + int64_t magic_number; + uint8_t buffer[BUFF_SIZE * 17]; long pair_c, intra_c; intra_link_mat_t *link_mat; intra_link_t *link; @@ -509,28 +511,37 @@ intra_link_mat_t *intra_link_mat_from_file(const char *f, asm_dict_t *dict, re_c if (fp == NULL) return 0; + m = fread(&magic_number, sizeof(int64_t), 1, fp); + if (!m || !is_valid_bin_header(magic_number)) { + fprintf(stderr, "[E::%s] not a valid BIN file\n", __func__); + return 0; + } + link_mat = use_gap_seq? intra_link_mat_init(dict, re_cuts, resolution) : intra_link_mat_init_sdict(dict->sdict, re_cuts, resolution); pair_c = 0; intra_c = 0; while (1) { - m = fread(&buffer, sizeof(uint32_t), BUFF_SIZE, fp); - for (i = 0; i < m; i += 4) { + m = fread(buffer, sizeof(uint8_t), BUFF_SIZE * 17, fp); + + for (i = 0; i < m; i += 17) { + if (*(uint8_t *) (buffer + i + 16) < mq) + continue; + if (use_gap_seq) { - sd_coordinate_conversion(dict, buffer[i], buffer[i + 1], &i0, &p0, 0); - sd_coordinate_conversion(dict, buffer[i + 2], buffer[i + 3], &i1, &p1, 0); + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i), *(uint32_t *) (buffer + i + 4), &i0, &p0, 0); + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i + 8), *(uint32_t *) (buffer + i + 12), &i1, &p1, 0); b0 = (MAX(p0, 1) - 1) / resolution; b1 = (MAX(p1, 1) - 1) / resolution; } else { - i0 = buffer[i]; - i1 = buffer[i + 2]; - b0 = (MAX(buffer[i + 1], 1) - 1) / resolution; - b1 = (MAX(buffer[i + 3], 1) - 1) / resolution; + i0 = *(uint32_t *) (buffer + i); + i1 = *(uint32_t *) (buffer + i + 8); + b0 = (MAX(*(uint32_t *) (buffer + i + 4), 1) - 1) / resolution; + b1 = (MAX(*(uint32_t *) (buffer + i + 12), 1) - 1) / resolution; } if (i0 == i1) { - ++intra_c; link = &link_mat->links[i0]; if (link->n) { if (b0 > b1) @@ -538,11 +549,14 @@ intra_link_mat_t *intra_link_mat_from_file(const char *f, asm_dict_t *dict, re_c k = (long) (link->n * 2 - b1 + b0 - 3) * (b1 - b0) / 2 + b1; link->link[k] += signf(link->link[k]); } + + ++intra_c; } + + ++pair_c; } - pair_c += m / 4; - if (m < BUFF_SIZE) { + if (m < BUFF_SIZE * 17) { if (ferror(fp)) return 0; break; @@ -567,11 +581,12 @@ intra_link_mat_t *intra_link_mat_from_file(const char *f, asm_dict_t *dict, re_c return link_mat; } -inter_link_mat_t *inter_link_mat_from_file(const char *f, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, uint32_t radius) +inter_link_mat_t *inter_link_mat_from_file(const char *f, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, uint32_t radius, uint8_t mq) { - uint32_t i, j, n, k, b0, b1; - uint32_t buffer[BUFF_SIZE], m, i0, i1; + uint32_t i, j, k, m, n, i0, i1, b0, b1; uint64_t p0, p1; + int64_t magic_number; + uint8_t buffer[BUFF_SIZE * 17]; double l0, l1, a, na[4], nc[4]; long pair_c, inter_c, radius_c, noise_c; inter_link_mat_t *link_mat; @@ -582,19 +597,29 @@ inter_link_mat_t *inter_link_mat_from_file(const char *f, asm_dict_t *dict, re_c if (fp == NULL) return 0; + m = fread(&magic_number, sizeof(int64_t), 1, fp); + if (!m || !is_valid_bin_header(magic_number)) { + fprintf(stderr, "[E::%s] not a valid BIN file\n", __func__); + return 0; + } + n = dict->n; link_mat = inter_link_mat_init(dict, re_cuts, resolution, radius); pair_c = inter_c = radius_c = 0; while (1) { - m = fread(&buffer, sizeof(uint32_t), BUFF_SIZE, fp); - for (i = 0; i < m; i += 4) { - sd_coordinate_conversion(dict, buffer[i], buffer[i + 1], &i0, &p0, 0); - sd_coordinate_conversion(dict, buffer[i + 2], buffer[i + 3], &i1, &p1, 0); + m = fread(buffer, sizeof(uint8_t), BUFF_SIZE * 17, fp); + + for (i = 0; i < m; i += 17) { + ++pair_c; + + if (*(uint8_t *) (buffer + i + 16) < mq) + continue; + + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i), *(uint32_t *) (buffer + i + 4), &i0, &p0, 0); + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i + 8), *(uint32_t *) (buffer + i + 12), &i1, &p1, 0); if (i0 != i1) { - ++inter_c; - if (i0 > i1) { SWAP(uint32_t, i0, i1); SWAP(uint64_t, p0, p1); @@ -645,11 +670,12 @@ inter_link_mat_t *inter_link_mat_from_file(const char *f, asm_dict_t *dict, re_c ++radius_c; } } + + ++inter_c; } } - pair_c += m / 4; - if (m < BUFF_SIZE) { + if (m < BUFF_SIZE * 17) { if (ferror(fp)) return 0; break; @@ -1091,11 +1117,12 @@ void inter_link_norms(inter_link_mat_t *link_mat, norm_t *norm, int use_estimate free(norms); } -int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict) +int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict, uint8_t mq) { - uint32_t i, j, n, na, k, b0, b1, b, ma, sma, n_ma; - uint32_t buffer[BUFF_SIZE], m, i0, i1; + uint32_t i, j, k, m, n, na, i0, i1, b0, b1, b, ma, sma, n_ma; uint64_t p0, p1; + int64_t magic_number; + uint8_t buffer[BUFF_SIZE * 17]; uint32_t *link, l; int8_t *directs; long pair_c, inter_c; @@ -1104,6 +1131,12 @@ int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict) fp = fopen(f, "r"); if (fp == NULL) return 0; + + m = fread(&magic_number, sizeof(int64_t), 1, fp); + if (!m || !is_valid_bin_header(magic_number)) { + fprintf(stderr, "[E::%s] not a valid BIN file\n", __func__); + return 0; + } n = dict->n; na = (long) n * (n - 1) / 2; @@ -1111,14 +1144,16 @@ int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict) pair_c = inter_c = 0; while (1) { - m = fread(&buffer, sizeof(uint32_t), BUFF_SIZE, fp); - for (i = 0; i < m; i += 4) { - sd_coordinate_conversion(dict, buffer[i], buffer[i + 1], &i0, &p0, 0); - sd_coordinate_conversion(dict, buffer[i + 2], buffer[i + 3], &i1, &p1, 0); + m = fread(buffer, sizeof(uint8_t), BUFF_SIZE * 17, fp); + + for (i = 0; i < m; i += 17) { + if (*(uint8_t *) (buffer + i + 16) < mq) + continue; - if (i0 != i1) { - ++inter_c; + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i), *(uint32_t *) (buffer + i + 4), &i0, &p0, 0); + sd_coordinate_conversion(dict, *(uint32_t *) (buffer + i + 8), *(uint32_t *) (buffer + i + 12), &i1, &p1, 0); + if (i0 != i1) { if (i0 > i1) { SWAP(uint32_t, i0, i1); SWAP(uint64_t, p0, p1); @@ -1127,10 +1162,13 @@ int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict) b1 = !((double) p1 / dict->s[i1].len > .5); b = (b0 << 1) | (!b1); ++link[b * na + (long) (n * 2 - i0 - 1) * i0 / 2 + i1 - i0 - 1]; + + ++inter_c; } + + ++pair_c; } - pair_c += m / 4; - if (m < BUFF_SIZE) { + if (m < BUFF_SIZE * 17) { if (ferror(fp)) return 0; break; @@ -1254,18 +1292,20 @@ static int32_t get_target_end(uint32_t *cigar, int n_cigar, int32_t s) return s; } -static char *parse_bam_rec(bam1_t *b, bam_header_t *h, uint8_t q, int32_t *s, int32_t *e, char **cname) +static char *parse_bam_rec(bam1_t *b, bam_header_t *h, uint8_t mq, int32_t *s, int32_t *e, uint8_t *q, char **cname) { - // 0x4 0x100 0x400 0x800 - if (b->core.flag & 0xD04) + // 0x4 0x100 0x200 0x400 0x800 + if (b->core.flag & 0xF04) return 0; *cname = h->target_name[b->core.tid]; - if (b->core.qual < q) { + if (b->core.qual < mq) { *s = -1; *e = -1; + *q = 0; } else { - *s = b->core.pos + 1; - *e = get_target_end(bam1_cigar(b), b->core.n_cigar, b->core.pos) + 1; + *s = b->core.pos; + *e = get_target_end(bam1_cigar(b), b->core.n_cigar, b->core.pos); + *q = b->core.qual; } return strdup(bam1_qname(b)); @@ -1273,13 +1313,13 @@ static char *parse_bam_rec(bam1_t *b, bam_header_t *h, uint8_t q, int32_t *s, in static char *parse_bam_rec1(bam1_t *b, bam_header_t *h, char **cname0, int32_t *s0, char **cname1, int32_t *s1) { - // 0x4 0x8 0x40 0x100 0x400 0x800 - if (b->core.flag & 0xD4C) + // 0x4 0x8 0x40 0x100 0x200 0x400 0x800 + if (b->core.flag & 0xF4C) return 0; *cname0 = h->target_name[b->core.tid]; - *s0 = b->core.pos + 1; + *s0 = b->core.pos; *cname1 = h->target_name[b->core.mtid]; - *s1 = b->core.mpos + 1; + *s1 = b->core.mpos; return bam1_qname(b); } @@ -1293,6 +1333,7 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 char *cname0, *cname1, *rname0, *rname1; int32_t s0, s1, e0, e1; uint32_t i0, i1, p0, p1; + uint8_t q, q0, q1; int8_t buff; long rec_c, pair_c, inter_c, intra_c; enum bam_sort_order so; @@ -1315,6 +1356,7 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 fprintf(stderr, "[E::%s] cannot open file %s for writing\n", __func__, out); exit(EXIT_FAILURE); } + write_bin_header(fo); h = bam_header_read(fp); b = bam_init1(); @@ -1322,28 +1364,30 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 cname0 = cname1 = rname0 = rname1 = 0; s0 = s1 = e0 = e1 = 0; i0 = i1 = p0 = p1 = 0; + q0 = q1 = 0; rec_c = pair_c = inter_c = intra_c = 0; buff = 0; if (so == ORDER_NAME) { // sorted by read names while (bam_read1(fp, b) >= 0 ) { + if (++rec_c % 1000000 == 0) fprintf(stderr, "[I::%s] %ld million records processed, %ld read pairs \n", __func__, rec_c / 1000000, pair_c); if (buff == 0) { - rname0 = parse_bam_rec(b, h, mq, &s0, &e0, &cname0); + rname0 = parse_bam_rec(b, h, mq, &s0, &e0, &q0, &cname0); if (!rname0) continue; ++buff; } else if (buff == 1) { - rname1 = parse_bam_rec(b, h, mq, &s1, &e1, &cname1); + rname1 = parse_bam_rec(b, h, mq, &s1, &e1, &q1, &cname1); if (!rname1) continue; if (strcmp(rname0, rname1) == 0) { - ++pair_c; + buff = 0; - if (s0 > 0 && s1 > 0) { + if (s0 >= 0 && s1 >= 0) { i0 = sd_get(dict, cname0); i1 = sd_get(dict, cname1); @@ -1360,52 +1404,60 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 fprintf(stderr, "[W::%s] sequence \"%s\" not found \n", __func__, cname1); } } else { - if (i0 == i1) - ++intra_c; - else - ++inter_c; p0 = s0 / 2 + e0 / 2 + (s0 & 1 && e0 & 1); p1 = s1 / 2 + e1 / 2 + (s1 & 1 && e1 & 1); if (i0 > i1) { SWAP(uint32_t, i0, i1); SWAP(uint32_t, p0, p1); } + q = MIN(q0, q1); fwrite(&i0, sizeof(uint32_t), 1, fo); fwrite(&p0, sizeof(uint32_t), 1, fo); fwrite(&i1, sizeof(uint32_t), 1, fo); fwrite(&p1, sizeof(uint32_t), 1, fo); + fwrite(&q, sizeof(uint8_t), 1, fo); + + if (i0 == i1) + ++intra_c; + else + ++inter_c; + + ++pair_c; } } + free(rname0); free(rname1); rname0 = 0; rname1 = 0; - buff = 0; } else { + buff = 1; + cname0 = cname1; s0 = s1; e0 = e1; + q0 = q1; free(rname0); rname0 = rname1; rname1 = 0; - buff = 1; } } } } else { // sorted by coordinates or others if (mq > 0) - fprintf(stderr, "[W::%s] BAM file is not sorted by read name. Filtering by mapping quality %hu suppressed \n", __func__, mq); - - while (bam_read1(fp, b) >= 0 ) { + fprintf(stderr, "[W::%s] BAM file is not sorted by read name. Filtering by mapping quality %hhu suppressed \n", __func__, mq); + q = 255; + + while (bam_read1(fp, b) >= 0) { + if (++rec_c % 1000000 == 0) fprintf(stderr, "[I::%s] %ld million records processed, %ld read pairs \n", __func__, rec_c / 1000000, pair_c); if(!parse_bam_rec1(b, h, &cname0, &s0, &cname1, &s1)) continue; - ++pair_c; - if (s0 > 0 && s1 > 0) { + if (s0 >= 0 && s1 >= 0) { i0 = sd_get(dict, cname0); i1 = sd_get(dict, cname1); @@ -1422,10 +1474,6 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 fprintf(stderr, "[W::%s] sequence \"%s\" not found \n", __func__, cname1); } } else { - if (i0 == i1) - ++intra_c; - else - ++inter_c; if (i0 > i1) { SWAP(uint32_t, i0, i1); SWAP(uint32_t, s0, s1); @@ -1434,6 +1482,14 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 fwrite(&s0, sizeof(uint32_t), 1, fo); fwrite(&i1, sizeof(uint32_t), 1, fo); fwrite(&s1, sizeof(uint32_t), 1, fo); + fwrite(&q, sizeof(uint8_t), 1, fo); + + if (i0 == i1) + ++intra_c; + else + ++inter_c; + + ++pair_c; } } } @@ -1454,7 +1510,7 @@ void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8 bam_close(fp); fclose(fo); - fprintf(stderr, "[I::%s] dumped %ld read pairs: %ld intra links + %ld inter links \n", __func__, pair_c, intra_c, inter_c); + fprintf(stderr, "[I::%s] dumped %ld read pairs from %ld records: %ld intra links + %ld inter links \n", __func__, pair_c, rec_c, intra_c, inter_c); } void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, const char *out) @@ -1465,6 +1521,7 @@ void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8 ssize_t read; char cname0[4096], cname1[4096], rname0[4096], rname1[4096]; uint32_t s0, s1, e0, e1, i0, i1, p0, p1; + uint8_t q, q0, q1; int8_t buff; long rec_c, pair_c, inter_c, intra_c; @@ -1485,19 +1542,26 @@ void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8 fprintf(stderr, "[E::%s] cannot open file %s for writing\n", __func__, out); exit(EXIT_FAILURE); } - + write_bin_header(fo); + s0 = s1 = e0 = e1 = 0; i0 = i1 = p0 = p1 = 0; + q0 = q1 = 0; rec_c = pair_c = inter_c = intra_c = 0; buff = 0; while ((read = getline(&line, &ln, fp)) != -1) { + + if (++rec_c % 1000000 == 0) + fprintf(stderr, "[I::%s] %ld million records processed, %ld read pairs \n", __func__, rec_c / 1000000, pair_c); + + if (buff == 0) { - sscanf(line, "%s %u %u %s", cname0, &s0, &e0, rname0); + sscanf(line, "%s %u %u %s %hhu %*s", cname0, &s0, &e0, rname0, &q0); ++buff; } else if (buff == 1) { - sscanf(line, "%s %u %u %s", cname1, &s1, &e1, rname1); + sscanf(line, "%s %u %u %s %hhu %*s", cname1, &s1, &e1, rname1, &q1); if (is_read_pair(rname0, rname1)) { - ++pair_c; + buff = 0; i0 = sd_get(dict, cname0); i1 = sd_get(dict, cname1); @@ -1515,33 +1579,36 @@ void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8 fprintf(stderr, "[W::%s] sequence \"%s\" not found \n", __func__, cname1); } } else { - if (i0 == i1) - ++intra_c; - else - ++inter_c; p0 = s0 / 2 + e0 / 2 + (s0 & 1 && e0 & 1); p1 = s1 / 2 + e1 / 2 + (s1 & 1 && e1 & 1); if (i0 > i1) { SWAP(uint32_t, i0, i1); SWAP(uint32_t, p0, p1); } + q = MIN(q0, q1); fwrite(&i0, sizeof(uint32_t), 1, fo); fwrite(&p0, sizeof(uint32_t), 1, fo); fwrite(&i1, sizeof(uint32_t), 1, fo); fwrite(&p1, sizeof(uint32_t), 1, fo); + fwrite(&q, sizeof(uint8_t), 1, fo); + + if (i0 == i1) + ++intra_c; + else + ++inter_c; + + ++pair_c; } - buff = 0; } else { + buff = 1; + strcpy(cname0, cname1); s0 = s1; e0 = e1; + q0 = q1; strcpy(rname0, rname1); - buff = 1; } } - - if (++rec_c % 1000000 == 0) - fprintf(stderr, "[I::%s] %ld million records processed, %ld read pairs \n", __func__, rec_c / 1000000, pair_c); } for (k = 0; k < kh_end(hmseq); ++k) @@ -1555,6 +1622,6 @@ void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8 fclose(fp); fclose(fo); - fprintf(stderr, "[I::%s] dumped %ld read pairs: %ld intra links + %ld inter links \n", __func__, pair_c, intra_c, inter_c); + fprintf(stderr, "[I::%s] dumped %ld read pairs from %ld records: %ld intra links + %ld inter links \n", __func__, pair_c, rec_c, intra_c, inter_c); } diff --git a/link.h b/link.h index ea5c7ba..197a784 100644 --- a/link.h +++ b/link.h @@ -88,8 +88,8 @@ extern "C" { intra_link_mat_t *intra_link_mat_init(asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution); intra_link_mat_t *intra_link_mat_init_sdict(sdict_t *dict, re_cuts_t *re_cuts, uint32_t resolution); inter_link_mat_t *inter_link_mat_init(asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, uint32_t radius); -intra_link_mat_t *intra_link_mat_from_file(const char *f, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, int use_gap_seq); -inter_link_mat_t *inter_link_mat_from_file(const char *f, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, uint32_t radius); +intra_link_mat_t *intra_link_mat_from_file(const char *f, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, int use_gap_seq, uint8_t mq); +inter_link_mat_t *inter_link_mat_from_file(const char *f, asm_dict_t *dict, re_cuts_t *re_cuts, uint32_t resolution, uint32_t radius, uint8_t mq); intra_link_t *get_intra_link(intra_link_mat_t *link_mat, uint32_t i, uint32_t j); inter_link_t *get_inter_link(inter_link_mat_t *link_mat, uint32_t i, uint32_t j); norm_t *calc_norms(intra_link_mat_t *link_mat); @@ -104,7 +104,7 @@ void intra_link_mat_destroy(intra_link_mat_t *link_mat); void inter_link_mat_destroy(inter_link_mat_t *link_mat); void norm_destroy(norm_t *norm); double *get_max_inter_norms(inter_link_mat_t *link_mat, asm_dict_t *dict); -int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict); +int8_t *calc_link_directs_from_file(const char *f, asm_dict_t *dict, uint8_t mq); void calc_link_directs(inter_link_mat_t *link_mat, double min_norm, asm_dict_t *dict, int8_t *directs); void dump_links_from_bam_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, const char *out); void dump_links_from_bed_file(const char *f, const char *fai, uint32_t ml, uint8_t mq, const char *out); diff --git a/sdict.c b/sdict.c index cf980e1..5499ff3 100644 --- a/sdict.c +++ b/sdict.c @@ -594,6 +594,17 @@ char nucl_toupper[] = { 'N', 'N', 'N', 'N', 'T', 'N', 'N', 'N', 'N', 'N', 'N', 123, 124, 125, 126, 127 }; +static int is_empty_line(char *line) +{ + char *c; + c = line; + while (isspace((unsigned char) *c)) + c++; + if(*c == 0) + return 1; + return 0; +} + void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int line_wd, int un_oris) { FILE *agp_in; @@ -602,7 +613,7 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li sd_seq_t s; size_t ln = 0; ssize_t read; - char sname[256], type[4], cname[256], cstarts[16], cends[16], oris[16]; + char sname[256], type[4], cname[256], cstarts[16], cends[16], oris[256]; char *name = NULL; uint32_t c; int64_t i, l, cstart, cend, ns, L; @@ -616,9 +627,10 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li dict = make_sdict_from_fa(fa, 0); l = L = ns = 0; while ((read = getline(&line, &ln, agp_in)) != -1) { - if (!strncmp(line, "#", 1)) - // header lines + if (is_empty_line(line) || !strncmp(line, "#", 1)) + // header or empty lines continue; + sname[0] = type[0] = cname[0] = cstarts[0] = cends[0] = oris[0] = '\0'; sscanf(line, "%s %*s %*s %*s %s %s %s %s %s", sname, type, cname, cstarts, cends, oris); if (!strncmp(type, "N", 1) || !strncmp(type, "U", 1)) { cend = strtoul(cname, NULL, 10); @@ -649,7 +661,7 @@ void write_fasta_file_from_agp(const char *fa, const char *agp, FILE *fo, int li cend = strtoul(cends, NULL, 10); c = sd_get(dict, cname); if (c == UINT32_MAX) { - fprintf(stderr, "[E::%s] sequence %s not found\n", __func__, cname); + fprintf(stderr, "[E::%s] sequence component '%s' not found\n", __func__, cname); exit(EXIT_FAILURE); } s = dict->s[c]; @@ -707,7 +719,7 @@ void write_segs_to_agp(sd_seg_t *segs, uint32_t n, sdict_t *sd, uint32_t s, FILE fprintf(fp, "scaffold_%u\t%lu\t%lu\t%d\tW\t%s\t%u\t%u\t%c\n", s, len + 1, len + seg.y, ++t, sd->s[seg.c >> 1].name, seg.x + 1, seg.x + seg.y, "+-"[seg.c & 1]); len += seg.y; if (i != n - 1) { - fprintf(fp, "scaffold_%u\t%lu\t%lu\t%d\tN\t%d\tscaffold\tyes\tna\n", s, len + 1, len + GAP_SZ, ++t, GAP_SZ); + fprintf(fp, "scaffold_%u\t%lu\t%lu\t%d\tN\t%d\tscaffold\tyes\t%s\n", s, len + 1, len + GAP_SZ, ++t, GAP_SZ, LINK_EVIDENCE); len += GAP_SZ; } } @@ -770,7 +782,7 @@ void write_asm_dict_to_agp(asm_dict_t *dict, char *out) fprintf(agp_out, "%s\t%lu\t%lu\t%u\tW\t%s\t%u\t%u\t%c\n", dict->s[i].name, len + 1, len + seg.y, ++t, dict->sdict->s[seg.c >> 1].name, seg.x + 1, seg.x + seg.y, "+-"[seg.c & 1]); len += seg.y; if (j != n - 1) { - fprintf(agp_out, "%s\t%lu\t%lu\t%d\tN\t%d\tscaffold\tyes\tna\n", dict->s[i].name, len + 1, len + GAP_SZ, ++t, GAP_SZ); + fprintf(agp_out, "%s\t%lu\t%lu\t%d\tN\t%d\tscaffold\tyes\t%s\n", dict->s[i].name, len + 1, len + GAP_SZ, ++t, GAP_SZ, LINK_EVIDENCE); len += GAP_SZ; } } diff --git a/yahs.c b/yahs.c index 5cc3bd3..bf54f69 100644 --- a/yahs.c +++ b/yahs.c @@ -41,7 +41,7 @@ #include "enzyme.h" #include "asset.h" -#define YAHS_VERSION "1.1" +#define YAHS_VERSION "1.2a" #undef DEBUG #undef DEBUG_ERROR_BREAK @@ -129,7 +129,7 @@ graph_t *build_graph_from_links(inter_link_mat_t *link_mat, asm_dict_t *dict, do return g; } -int run_scaffolding(char *fai, char *agp, char *link_file, uint32_t ml, re_cuts_t *re_cuts, char *out, int resolution, double *noise, long rss_limit) +int run_scaffolding(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, re_cuts_t *re_cuts, char *out, int resolution, double *noise, long rss_limit) { //TODO: adjust wt thres by resolution sdict_t *sdict = make_sdict_from_index(fai, ml); @@ -157,7 +157,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, uint32_t ml, re_cuts_ } rss_limit -= rss_intra; fprintf(stderr, "[I::%s] starting norm estimation...\n", __func__); - intra_link_mat_t *intra_link_mat = intra_link_mat_from_file(link_file, dict, re_cuts, resolution, 1); + intra_link_mat_t *intra_link_mat = intra_link_mat_from_file(link_file, dict, re_cuts, resolution, 1, mq); #ifdef DEBUG_RAM_USAGE printf("[I::%s] RAM peak: %.3fGB\n", __func__, (double) peakrss() / GB); @@ -186,7 +186,7 @@ int run_scaffolding(char *fai, char *agp, char *link_file, uint32_t ml, re_cuts_ } rss_limit -= rss_inter; fprintf(stderr, "[I::%s] starting link estimation...\n", __func__); - inter_link_mat_t *inter_link_mat = inter_link_mat_from_file(link_file, dict, re_cuts, resolution, norm->r); + inter_link_mat_t *inter_link_mat = inter_link_mat_from_file(link_file, dict, re_cuts, resolution, norm->r, mq); #ifdef DEBUG_RAM_USAGE printf("[I::%s] RAM peak: %.3fGB\n", __func__, (double) peakrss() / GB); @@ -276,7 +276,7 @@ int contig_error_break(char *fai, char *link_file, uint32_t ml, char *out) sdict = make_sdict_from_index(fai, ml); dict = make_asm_dict_from_sdict(sdict); - dist_thres = estimate_dist_thres_from_file(link_file, dict, ec_min_frac, ec_resolution); + dist_thres = estimate_dist_thres_from_file(link_file, dict, ec_min_frac, ec_resolution, 0); dist_thres = MAX(dist_thres, ec_min_window); fprintf(stderr, "[I::%s] dist threshold for contig error break: %d\n", __func__, dist_thres); asm_destroy(dict); @@ -285,7 +285,7 @@ int contig_error_break(char *fai, char *link_file, uint32_t ml, char *out) ec_round = err_no = 0; while (1) { dict = ec_round? make_asm_dict_from_agp(sdict, out1) : make_asm_dict_from_sdict(sdict); - link_mat_t *link_mat = link_mat_from_file(link_file, dict, dist_thres, ec_bin, .0, ec_move_avg); + link_mat_t *link_mat = link_mat_from_file(link_file, dict, dist_thres, ec_bin, .0, ec_move_avg, 0); #ifdef DEBUG_ERROR_BREAK printf("[I::%s] ec_round %u link matrix\n", __func__, ec_round); print_link_mat(link_mat, dict, stdout); @@ -318,7 +318,7 @@ int contig_error_break(char *fai, char *link_file, uint32_t ml, char *out) return ec_round; } -int scaffold_error_break(char *fai, char *link_file, uint32_t ml, char *agp, int flank_size, double noise, char *out) +int scaffold_error_break(char *fai, char *link_file, uint32_t ml, uint8_t mq, char *agp, int flank_size, double noise, char *out) { int dist_thres; sdict_t *sdict = make_sdict_from_index(fai, ml); @@ -328,7 +328,7 @@ int scaffold_error_break(char *fai, char *link_file, uint32_t ml, char *agp, int //dist_thres = estimate_dist_thres_from_file(link_file, dict, ec_min_frac, ec_resolution); //dist_thres = MAX(dist_thres, ec_min_window); //fprintf(stderr, "[I::%s] dist threshold for scaffold error break: %d\n", __func__, dist_thres); - link_mat_t *link_mat = link_mat_from_file(link_file, dict, dist_thres, ec_bin, noise, ec_move_avg); + link_mat_t *link_mat = link_mat_from_file(link_file, dict, dist_thres, ec_bin, noise, ec_move_avg, mq); #ifdef DEBUG_ERROR_BREAK printf("[I::%s] link matrix\n", __func__); @@ -366,7 +366,7 @@ static void print_asm_stats(uint64_t *n_stats, uint32_t *l_stats) #endif } -int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, char *out, int *resolutions, int nr, re_cuts_t *re_cuts, int no_contig_ec, int no_scaffold_ec) +int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, uint8_t mq, char *out, int *resolutions, int nr, re_cuts_t *re_cuts, int no_contig_ec, int no_scaffold_ec) { int ec_round, re, r, rc; char *out_fn, *out_agp, *out_agp_break; @@ -436,12 +436,12 @@ int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, char *out, int sprintf(out_fn, "%s_r%02d", out, r); // noise per unit - re = run_scaffolding(fai, out_agp_break, link_file, ml, re_cuts, out_fn, resolutions[r - 1], &noise, rss_limit); + re = run_scaffolding(fai, out_agp_break, link_file, ml, mq, re_cuts, out_fn, resolutions[r - 1], &noise, rss_limit); if (!re) { sprintf(out_agp, "%s_r%02d.agp", out, r); if (no_scaffold_ec == 0) { sprintf(out_agp_break, "%s_r%02d_break.agp", out, r); - scaffold_error_break(fai, link_file, ml, out_agp, resolutions[r - 1], noise, out_agp_break); + scaffold_error_break(fai, link_file, ml, mq, out_agp, resolutions[r - 1], noise, out_agp_break); } else { sprintf(out_agp_break, "%s", out_agp); } @@ -489,7 +489,7 @@ int run_yahs(char *fai, char *agp, char *link_file, uint32_t ml, char *out, int } #ifndef DEBUG_GT4G -static int default_resolutions[13] = {10000, 20000, 50000, 100000, 200000, 500000, 1000000, 2000000, 5000000, 10000000, 20000000, 50000000, 100000000}; +static int default_resolutions[15] = {10000, 20000, 50000, 100000, 200000, 500000, 1000000, 2000000, 5000000, 10000000, 20000000, 50000000, 100000000, 200000000, 500000000}; #else static int default_resolutions[13] = {50000, 100000, 250000, 500000, 1000000, 2500000, 5000000, 10000000, 25000000, 50000000, 100000000, 250000000, 500000000}; #endif @@ -517,8 +517,12 @@ static int default_nr(char *fai, uint32_t ml) max_res = 20000000; else if (genome_size < 5000000000) max_res = 50000000; - else + else if (genome_size < 10000000000) max_res = 100000000; + else if (genome_size < 20000000000) + max_res = 200000000; + else + max_res = 500000000; nr = 0; while (nr < sizeof(default_resolutions) / sizeof(int) && default_resolutions[nr] <= max_res) @@ -721,12 +725,12 @@ int main(int argc, char *argv[]) link_bin_file = malloc(strlen(out) + 5); sprintf(link_bin_file, "%s.bin", out); fprintf(stderr, "[I::%s] dump hic links (BAM) to binary file %s\n", __func__, link_bin_file); - dump_links_from_bam_file(link_file, fai, ml, mq8, link_bin_file); + dump_links_from_bam_file(link_file, fai, ml, 0, link_bin_file); } else if (strcmp(ext, ".bed") == 0) { link_bin_file = malloc(strlen(out) + 5); sprintf(link_bin_file, "%s.bin", out); fprintf(stderr, "[I::%s] dump hic links (BED) to binary file %s\n", __func__, link_bin_file); - dump_links_from_bed_file(link_file, fai, ml, mq8, link_bin_file); + dump_links_from_bed_file(link_file, fai, ml, 0, link_bin_file); } else if (strcmp(ext, ".bin") == 0) { link_bin_file = malloc(strlen(link_file) + 1); sprintf(link_bin_file, "%s", link_file); @@ -756,7 +760,7 @@ int main(int argc, char *argv[]) printf("[I::%s] ec[S]: %d\n", __func__, no_scaffold_ec); #endif - ret = run_yahs(fai, agp, link_bin_file, ml, out, resolutions, nr, re_cuts, no_contig_ec, no_scaffold_ec); + ret = run_yahs(fai, agp, link_bin_file, ml, mq8, out, resolutions, nr, re_cuts, no_contig_ec, no_scaffold_ec); if (ret == 0) { agp_final = (char *) malloc(strlen(out) + 35);