-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutils.h
More file actions
231 lines (211 loc) · 9.11 KB
/
utils.h
File metadata and controls
231 lines (211 loc) · 9.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#ifndef __UTILS_H__
#define __UTILS_H__
#include "struct_def.h"
#include "lps.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
/**
* Function-like minimum / maximum helpers.
*
* @note Each argument appears twice in the expansion, so an argument with
*       side effects (e.g. `MIN(i++, j)`) may be evaluated twice.
*/
#define MIN(a,b) (((a)<(b))?(a):(b))
#define MAX(a,b) (((a)>(b))?(a):(b))
/**
* NOTE(review): presumably a length threshold for structural variants; the
* unit and the way it is used are not visible in this header -- confirm
* against the callers.
*/
#define SV_LEN_BOUNDARY 50
/**
* Performs binary search on a sorted array to find the index of a given key.
*
* @param arr Pointer to the sorted array.
* @param size Number of elements in the array.
* @param key The value to search for.
* @return Index of the key if found, otherwise -1.
*
* @note The index is returned as `int` while `size` is `uint64_t`, so a
*       position larger than INT_MAX cannot be represented in the return value.
*/
int binary_search(uint64_t *arr, uint64_t size, uint64_t key);
/**
* Sorts an array using the quicksort algorithm.
*
* @param arr Pointer to the array to be sorted.
* @param low Starting index of the array (or subarray).
* @param high Ending index of the array (or subarray).
*
* NOTE(review): `high` is presumably inclusive (index of the last element,
* not one past it) -- confirm against the implementation.
*/
void quicksort(uint64_t *arr, int low, int high);
/**
* @brief Opens a file in read mode ("r").
*
* This function attempts to open the given file for reading.
* If the file cannot be opened, an error message is printed to stderr
* and the program exits with EXIT_FAILURE, i.e. the function does not
* return to the caller on failure.
*
* @param[out] file Pointer to a FILE* that will store the opened file handle.
* @param[in] filename Path to the file to open.
*/
void open_file_r(FILE **file, const char *filename);
/**
* @brief Opens a file in write mode ("w").
*
* This function attempts to open the given file for writing.
* If the file cannot be opened, an error message is printed to stderr
* and the program exits with EXIT_FAILURE, i.e. the function does not
* return to the caller on failure.
*
* @param[out] file Pointer to a FILE* that will store the opened file handle.
* @param[in] filename Path to the file to open.
*/
void open_file_w(FILE **file, const char *filename);
/**
* @brief Creates and opens output files (segment + link) and writes initial headers/logs.
*
* This function creates output file names based on the input GFA path:
* - "<gfa_path>.s.0" for segment output
* - "<gfa_path>.l.0" for link output
*
* It opens both files in write mode and writes the GFA header line
* into the segment file.
*
* It also creates a log file:
* - "lcpan.log" if args->prefix is NULL
* - "<prefix>.log" otherwise
*
* The log file stores basic run information (tool name, gfa path, thread number).
*
* NOTE(review): what happens when one of the files cannot be opened is not
* described here -- confirm against the implementation.
*
* @param[in] args Pointer to the program arguments/options struct.
* @param[out] out_segment Output pointer where the opened segment file handle will be stored.
* @param[out] out_link Output pointer where the opened link file handle will be stored.
*/
void open_files(struct opt_arg *args, FILE **out_segment, FILE **out_link);
/**
* Prints three sequences as a single segment in GFA or rGFA format.
*
* @param id Sequence identifier.
* @param seq1 The first nucleotide sequence.
* @param seq1_len Length of the first sequence.
* @param seq2 The second nucleotide sequence.
* @param seq2_len Length of the second sequence.
* @param seq3 The third nucleotide sequence.
* @param seq3_len Length of the third sequence.
* @param seq_name Name of the sequence.
* @param start Start position of the sequence.
* @param rank Rank of the sequence.
* @param is_rgfa Flag to determine if rGFA format should be used.
* @param out Output file stream.
*/
void print_seq3(uint64_t id, const char *seq1, int seq1_len, const char *seq2, int seq2_len, const char *seq3, int seq3_len, const char *seq_name, int start, int rank, int is_rgfa, FILE *out);
/**
* Prints two sequences as a single segment in GFA or rGFA format.
*
* @param id Sequence identifier.
* @param seq1 The first nucleotide sequence.
* @param seq1_len Length of the first sequence.
* @param seq2 The second nucleotide sequence.
* @param seq2_len Length of the second sequence.
* @param seq_name Name of the sequence.
* @param start Start position of the sequence.
* @param rank Rank of the sequence.
* @param is_rgfa Flag to determine if rGFA format should be used.
* @param out Output file stream.
*/
void print_seq2(uint64_t id, const char *seq1, int seq1_len, const char *seq2, int seq2_len, const char *seq_name, int start, int rank, int is_rgfa, FILE *out);
/**
* Prints a single sequence as a segment in GFA or rGFA format.
*
* @param id Sequence identifier.
* @param seq The nucleotide sequence.
* @param seq_len Length of the sequence.
* @param seq_name Name of the sequence.
* @param start Start position of the sequence.
* @param rank Rank of the sequence.
* @param is_rgfa Flag to determine if rGFA format should be used.
* @param out Output file stream.
*/
void print_seq(uint64_t id, const char *seq, int seq_len, const char *seq_name, int start, int rank, int is_rgfa, FILE *out);
/**
* Prints three sequences as a single segment in GFA or rGFA format. Unlike `print_seq`, this function
* does not print the index, but prints >id to index information (`SN:Z:`).
*
* Compared with `print_seq3`, the signature takes an additional `order` argument.
*
* NOTE(review): the exact content written to the `SN:Z:` tag is not visible
* in this header -- confirm against the implementation.
*
* @param id Sequence identifier.
* @param seq1 The first nucleotide sequence.
* @param seq1_len Length of the first sequence.
* @param seq2 The second nucleotide sequence.
* @param seq2_len Length of the second sequence.
* @param seq3 The third nucleotide sequence.
* @param seq3_len Length of the third sequence.
* @param seq_name The name/ID of the sequence.
* @param order The index of the sequence (order).
* @param start Start position of the sequence.
* @param rank Rank of the sequence.
* @param is_rgfa Flag to determine if rGFA format should be used.
* @param out Output file stream.
*/
void print_seq3_vg(uint64_t id, const char *seq1, int seq1_len, const char *seq2, int seq2_len, const char *seq3, int seq3_len, const char *seq_name, int order, int start, int rank, int is_rgfa, FILE *out);
/**
* Prints two sequences as a single segment in GFA or rGFA format. Unlike `print_seq`, this function
* does not print the index, but prints >id to index information (`SN:Z:`).
*
* Compared with `print_seq2`, the signature takes an additional `order` argument.
*
* NOTE(review): the exact content written to the `SN:Z:` tag is not visible
* in this header -- confirm against the implementation.
*
* @param id Sequence identifier.
* @param seq1 The first nucleotide sequence.
* @param seq1_len Length of the first sequence.
* @param seq2 The second nucleotide sequence.
* @param seq2_len Length of the second sequence.
* @param seq_name The name/ID of the sequence.
* @param order The index of the sequence (order).
* @param start Start position of the sequence.
* @param rank Rank of the sequence.
* @param is_rgfa Flag to determine if rGFA format should be used.
* @param out Output file stream.
*/
void print_seq2_vg(uint64_t id, const char *seq1, int seq1_len, const char *seq2, int seq2_len, const char *seq_name, int order, int start, int rank, int is_rgfa, FILE *out);
/**
* Prints a sequence in GFA or rGFA format. Unlike `print_seq`, this function
* does not print the index, but prints >id to index information (`SN:Z:`).
*
* Compared with `print_seq`, the signature takes an additional `order` argument.
*
* NOTE(review): the exact content written to the `SN:Z:` tag is not visible
* in this header -- confirm against the implementation.
*
* @param id Sequence identifier.
* @param seq The nucleotide sequence.
* @param seq_len Length of the sequence.
* @param seq_name The name/ID of the sequence.
* @param order The index of the sequence (order).
* @param start Start position of the sequence.
* @param rank Rank of the sequence.
* @param is_rgfa Flag to determine if rGFA format should be used.
* @param out Output file stream.
*/
void print_seq_vg(uint64_t id, const char *seq, int seq_len, const char *seq_name, int order, int start, int rank, int is_rgfa, FILE *out);
/**
* Prints a link between two sequences in GFA format.
*
* @param id1 Identifier of the first sequence.
* @param sign1 Orientation of the first sequence ('+' or '-').
* @param id2 Identifier of the second sequence.
* @param sign2 Orientation of the second sequence ('+' or '-').
* @param overlap Length of the overlap between the two sequences.
* @param out Output file stream the link is written to.
*/
void print_link(uint64_t id1, char sign1, uint64_t id2, char sign2, uint64_t overlap, FILE *out);
/**
* Finds the latest core index before a given range and the first core index after it.
*
* The two results are written through the `latest_core_index` and
* `first_core_after` pointers; the function itself returns nothing.
*
* @param[in] start_loc Start location of the range.
* @param[in] end_loc End location of the range.
* @param[in] chrom Pointer to the chromosome structure containing core regions.
* @param[in] start_index Estimated start index of LCP core to search.
* @param[out] latest_core_index Pointer to store the latest core index before start_loc.
* @param[out] first_core_after Pointer to store the first core index after end_loc.
*/
void find_boundaries(uint64_t start_loc, uint64_t end_loc, const struct chr *chrom, uint64_t start_index, uint64_t *latest_core_index, uint64_t *first_core_after);
/**
* Modifies start indices of the LCP cores if no overlap is allowed.
*
* @param seqs The LCP cores to be refined.
* @param no_overlap Boolean flag; the refinement is performed if it is set to 1.
*/
void refine_seqs(struct ref_seq *seqs, int no_overlap);
/**
* Modifies start indices of the LCP cores in `str` if no overlap is allowed.
*
* Takes a `struct lps`, whereas `refine_seqs` takes a `struct ref_seq`.
*
* @param str The LCP cores to be refined.
* @param no_overlap Boolean flag; the refinement is performed if it is set to 1.
*/
void refine_seq(struct lps *str, int no_overlap);
/**
* Prints all paths in the given sequences. The ids should be initialized
* before this function is called.
*
* @param seqs The reference sequences.
* @param out Output file to write the paths to.
*/
void print_path(const struct ref_seq *seqs, FILE *out);
#endif