@@ -40,7 +40,15 @@ namespace bio::map_io
4040 * \ingroup map_io
4141 * \details
4242 *
43- * TODO
43+ * Each header line begins with the character `@` followed by one of the two-letter header record type codes
44+ * defined in this section. In the header, each line is tab-delimited and, apart from `@CO` lines, each data field
45+ * follows a format `TAG:VALUE` where TAG is a two-character string that defines the format and content of
46+ * VALUE. Thus header lines match `/^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/` or are comment lines starting
47+ * with `@CO` followed by a tab and any character sequence.
48+ * Within each (non-`@CO`) header line, no field tag may appear more than once and the order in which the fields
49+ * appear is not significant.
50+ *
51+ * \sa https://samtools.github.io/hts-specs/SAMv1.pdf
4452 */
4553class header
4654{
@@ -57,7 +65,6 @@ class header
5765 header & operator =(header &&) = default ; // !< Defaulted.
5866
5967 /* !\brief Construct from a range of reference ids.
60- * \param[in] The plain text header.
6168 * \param[in] ref_ids The range over reference ids to redirect the pointer at.
6269 */
6370 template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
@@ -95,6 +102,7 @@ class header
95102 // !\brief The reference sequence names.
96103 std::vector<std::string_view> reference_names;
97104
105+ // !\brief Additional information for each reference sequence (same ordering as `reference_names`).
98106 std::vector<std::tuple<int32_t , std::string>> reference_names_info{};
99107
100108 // !\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
@@ -103,6 +111,7 @@ class header
103111 // !\brief Whether reference sequence names were given to the header on construction.
104112 bool reference_names_given_on_construction{false };
105113
114+ // !\brief Print a B.I.O warning message with current line number in diagnostic.
106115 /* [[noreturn]] compiler says this returns something...? */ void warning (auto const &... messages) const
107116 {
108117 // if (print_warnings)
@@ -114,22 +123,22 @@ class header
114123 // }
115124 }
116125public:
117- /* !\name [@ HD] File-level meta data
126+ /* !\name [HD] File-level meta data
118127 * \brief You can directly edit these member variables.
119128 * \{
120129 */
121- std::string format_version{}; // !< [@ HD VN] The file format version. Note: this is overwritten by our formats on output.
122- std::string sorting{}; // !< [@ HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate].
123- std::string grouping{}; // !< [@ HD GO] The grouping of the file. SAM: [none, query, reference].
124- std::string subsorting{}; // !< [@ HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, coordinate](:[A-Za-z0-9_-]+)+.
130+ std::string format_version{}; // !< [HD VN] The file format version. Note: this is overwritten by our formats on output.
131+ std::string sorting{}; // !< [HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate].
132+ std::string grouping{}; // !< [HD GO] The grouping of the file. SAM: [none, query, reference].
133+ std::string subsorting{}; // !< [HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, coordinate]` (:[A-Za-z0-9_-]+)+` .
125134 // !\}
126135
127- /* !\name [@ SQ] Reference sequence dictionary
136+ /* !\name [SQ] Reference sequence dictionary
128137 * \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers.
129138 * \{
130139 */
131140
132- /* !\brief [@ SQ SN] Reference sequence names
141+ /* !\brief [SQ SN] Reference sequence names
133142 *
134143 * \details
135144 *
@@ -139,11 +148,11 @@ class header
139148 * 1) Reference id information is provided on construction. In this case, no copy is made but this function
140149 * gives you a reference to the provided range. When reading the header or the records, their reference
141150 * information will be checked against the given input.
142- * 2) No reference information is provided on construction but the \ @SQ tags are present in the header.
151+ * 2) No reference information is provided on construction but the ` @SQ` tags are present in the header.
143152 * In this case, the reference id information is extracted from the header and this member function provides
144153 * access to them. When reading the records, their reference id information will be checked against the header
145154 * information.
146- * 3) No reference information is provided on construction an no \ @SQ tags are present in the header.
155+ * 3) No reference information is provided on construction and no ` @SQ` tags are present in the header.
147156 * In this case, the reference information is parsed from the records field::ref_id and stored in the header.
148157 * This member function then provides access to the unique list of reference names encountered in the records.
149158 */
@@ -152,11 +161,11 @@ class header
152161 return reference_names;
153162 }
154163
155- /* !\brief [@ SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
164+ /* !\brief [SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
156165 *
157166 * \details
158167 *
159- * The reference information store the length (\ @LN tag) and
168+ * The reference information stores the length (`LN` tag) and
160169 * additional information of each reference sequence in the file. The record
161170 * must then store only the index of the reference.
162171 * The name and length information are required if the header is provided
@@ -166,17 +175,17 @@ class header
166175 *
167176 * The additional information (2nd tuple entry) must model
168177 * the following formatting rules: The information is given in a tab separated
169- * TAG:VALUE format, where TAG must be one of [AH, AN, AS, m5, SP, UR].
178+ * ` TAG:VALUE` format, where TAG must be one of [AH, AN, AS, M5, SP, UR].
170179 * The following information and rules apply for each tag (taken from the SAM specs):
171180 *
172181 * * **AH:** Indicates that this sequence is an alternate locus. The value is the locus in the primary assembly for
173- * which this sequence is an alternative, in the format ' chr:start-end', ' chr' (if known), or '*' (if
174- * unknown), where ' chr' is a sequence in the primary assembly. Must not be present on sequences in the
182+ * which this sequence is an alternative, in the format ` chr:start-end`, ` chr` (if known), or `*` (if
183+ * unknown), where ` chr` is a sequence in the primary assembly. Must not be present on sequences in the
175184 * primary assembly.
176185 * * **AN:** Alternative reference sequence names. A comma-separated list of alternative names that tools may use
177186 * when referring to this reference sequence. These alternative names are not used elsewhere within the
178187 * SAM file; in particular, they must not appear in alignment records’ RNAME or RNEXT fields. Regular
179- * expression : name (, name )* where name is [0-9A-Za-z][0-9A-Za-z*+.@ \|-]*
188+ * expression : ` name (, name )*` where name is ` [0-9A-Za-z][0-9A-Za-z*+.@ \|-]*`.
180189 * * **AS:** Genome assembly identifier.
181190 * * **M5:** MD5 checksum of the sequence. See Section 1.3.1
182191 * * **SP:** Species.
@@ -204,7 +213,7 @@ class header
204213 }
205214 // !\}
206215
207- /* !\name [@ RG] Read groups
216+ /* !\name [RG] Read groups
208217 * \brief You can directly edit these member variables.
209218 * \{
210219 */
@@ -215,7 +224,7 @@ class header
215224 * The read group list stores the group id and
216225 * additional information of each read group in the file. The record
217226 * may store a RG tag information referencing one of the stored id's.
218- * The id information is required if the @RG header line is provided.
227+ * The id information is required if the \ @RG header line is provided.
219228 *
220229 * The additional information (2nd tuple entry) for the SAM format must follow
221230 * the following formatting rules: The information is given in a tab separated
@@ -225,13 +234,13 @@ class header
225234 * * **BC:** Barcode sequence identifying the sample or library. This value is the expected barcode bases as read by
226235 * the sequencing machine in the absence of errors. If there are several barcodes for the sample/library
227236 * (e.g., one on each end of the template), the recommended implementation concatenates all the barcodes
228- * separating them with hyphens ('-' ).
237+ * separating them with hyphens (`-` ).
229238 * * **CN:** Name of sequencing center producing the read.
230239 * * **DS:** Description. UTF-8 encoding may be used.
231240 * * **DT:** Date the run was produced (ISO8601 date or date/time).
232241 * * **FO:** Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each
233242 * read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other
234- * characters. Format : /\*\|[ACMGRSVTWYHKDBN]+/
243+ * characters. Format : ` /\*\|[ACMGRSVTWYHKDBN]+/`
235244 * * **KS:** The array of nucleotide bases that correspond to the key sequence of each read.
236245 * * **LB:** Library.
237246 * * **PG:** Programs used for processing the read group.
@@ -245,7 +254,7 @@ class header
245254 std::vector<std::pair<std::string, std::string>> read_groups{};
246255 // !\}
247256
248- /* !\name [@ PG] Programm information
257+ /* !\name [PG] Program information
249258 * \brief You can directly edit these member variables.
250259 * \{
251260 */
@@ -263,7 +272,7 @@ class header
263272 std::vector<program_info_t > program_infos{}; // !< The list of program information.
264273 // !\}
265274
266- /* !\name [@ CO] Comments
275+ /* !\name [CO] Comments
267276 * \brief You can directly edit these member variables.
268277 * \{
269278 */
@@ -272,17 +281,16 @@ class header
272281};
273282
274283/* !\brief Reads the SAM header.
275- * \tparam stream_view_type The type of the stream as a view.
276- * \param[in, out] stream_view The stream view to iterate over.
284+ * \param[in] header_string The full header as a std::string_view.
277285 *
278- * \throws seqan3 ::format_error if any unexpected character or format is encountered.
286+ * \throws bio::map_io ::format_error if any unexpected character or format is encountered.
279287 *
280288 * \details
281289 *
282290 * Reading the header format is done according to the official
283291 * [SAM format specifications](https://samtools.github.io/hts-specs/SAMv1.pdf).
284292 *
285- * The function throws a seqan3 ::format_error if any unknown tag was encountered. It will also fail if the format is
293+ * The function throws a bio::map_io ::format_error if any unknown tag was encountered. It will also fail if the format is
286294 * not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual
287295 * error.
288296 */
0 commit comments