Skip to content

Commit df0dfdc

Browse files
committed
header reference works
1 parent 6e5f334 commit df0dfdc

File tree

7 files changed

+231
-199
lines changed

7 files changed

+231
-199
lines changed

include/bio/format/sam_input_handler.hpp

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <string_view>
2222
#include <vector>
2323

24+
#include <seqan3/core/debug_stream.hpp>
2425
#include <seqan3/utility/type_list/traits.hpp>
2526
#include <seqan3/io/sam_file/detail/cigar.hpp>
2627

@@ -60,13 +61,13 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
6061
//!\brief The raw record.
6162
raw_record_type raw_record;
6263
//!\brief The header.
63-
map_io::header<std::vector<std::string>> header;
64+
map_io::header header;
6465
//!\brief Lowlevel stream iterator.
6566
lowlevel_iterator file_it;
6667
//!\brief Cache of the chromosome string.
67-
std::string last_chrom;
68+
std::string last_rname;
6869
//!\brief Cache of the chromosome index.
69-
int32_t last_chrom_idx = -1;
70+
int32_t last_rname_idx = -1;
7071
//!\brief Current line number in file.
7172
size_t line = 0;
7273
//!\brief Whether to print print_warnings or not.
@@ -80,6 +81,16 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
8081
throw format_error{message};
8182
}
8283

84+
/* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const
85+
{
86+
if (print_warnings)
87+
{
88+
seqan3::debug_stream << "[B.I.O sam format warning in line " << line << "] ";
89+
(seqan3::debug_stream << ... << messages);
90+
seqan3::debug_stream << std::endl;
91+
}
92+
}
93+
8394
void read_raw_record()
8495
{
8596
++line;
@@ -126,9 +137,27 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
126137
std::string_view raw_field = get<field::rname>(raw_record);
127138

128139
if (raw_field != "*")
129-
parse_field_aux(raw_field, parsed_field);
140+
{
141+
size_t rname_pos;
142+
143+
if (auto it = header.rname_to_pos().find(raw_field); it == header.rname_to_pos().end())
144+
{ // rname name was not in header, insert!
145+
header.push_back_rname(raw_field, raw_field.size(), "");
146+
147+
rname_pos = header.rnames().size() - 1;
148+
149+
warning("The reference sequence name \"", raw_field, "\" is not present in the header.");
150+
}
151+
else
152+
{
153+
rname_pos = it->second;
154+
}
130155

131-
// todo insert into header
156+
if constexpr (std::integral<std::remove_cvref_t<parsed_field_t>>)
157+
parsed_field = rname_pos;
158+
else
159+
parsed_field = header.rnames()[rname_pos];
160+
}
132161
}
133162

134163
/* POS, MAPQ are handled correctly by default, unless we want pos to be read into an std:optional */
@@ -149,7 +178,7 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
149178
auto const res = std::from_chars(ptr, end, cigar_count); // reads number up to next chatacter
150179

151180
if (res.ec != std::errc{}) // parse number
152-
throw format_error{"Corrupted cigar string encountered"}; // todo, how in b.i.o?
181+
error("Corrupted cigar string encountered");
153182

154183
ptr = res.ptr + 1; // skip cigar operation character
155184

@@ -169,7 +198,25 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
169198
if (raw_field == "=")
170199
raw_field = get<field::rname>(raw_record);
171200

172-
parse_field_aux(raw_field, parsed_field);
201+
size_t rname_pos;
202+
203+
if (auto it = header.rname_to_pos().find(raw_field); it == header.rname_to_pos().end())
204+
{ // rname name was not in header, insert!
205+
header.push_back_rname(raw_field, raw_field.size(), "");
206+
207+
rname_pos = header.rnames().size() - 1;
208+
209+
warning("The mates reference sequence name \"", raw_field, "\" is not present in the header.");
210+
}
211+
else
212+
{
213+
rname_pos = it->second;
214+
}
215+
216+
if constexpr (std::integral<std::remove_cvref_t<parsed_field_t>>)
217+
parsed_field = rname_pos;
218+
else
219+
parsed_field = header.rnames()[rname_pos];
173220
}
174221
}
175222

@@ -195,10 +242,11 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
195242
parse_field_aux(raw_field, parsed_field); // reading into e.g. dna4 vector
196243
}
197244

198-
// void parse_field(vtag_t<field::_private> const & /**/, map_io::record_private_data & parsed_field)
199-
// {
200-
// parsed_field = map_io::record_private_data{.header_ptr = &header};
201-
// }
245+
//!\brief Overload for parsing the private data.
246+
void parse_field(vtag_t<field::_private> const & /**/, map_io::record_private_data & parsed_field)
247+
{
248+
parsed_field = map_io::record_private_data{.header_ptr = &header};
249+
}
202250

203251
public:
204252
format_input_handler() = default;
@@ -232,7 +280,7 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
232280
else if (header_string.ends_with("\n"))
233281
header_string = header_string.substr(0, header_string.size() - 1);
234282

235-
header = map_io::header<std::vector<std::string>>{};
283+
header = map_io::header{};
236284
header.read(header_string);
237285
}
238286

include/bio/map_io/header.hpp

Lines changed: 97 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <seqan3/io/sam_file/detail/cigar.hpp>
2929
#include <seqan3/utility/views/single_pass_input.hpp>
3030
#include <seqan3/utility/views/type_reduce.hpp>
31+
#include <seqan3/core/debug_stream.hpp>
3132

3233
#include <bio/exception.hpp>
3334
#include <bio/misc.hpp>
@@ -41,7 +42,6 @@ namespace bio::map_io
4142
*
4243
* TODO
4344
*/
44-
template <typename ref_ids_type = std::vector<std::string>>
4545
class header
4646
{
4747
public:
@@ -56,63 +56,63 @@ class header
5656
header & operator=(header const &) = default; //!< Defaulted.
5757
header & operator=(header &&) = default; //!< Defaulted.
5858

59-
/*!\brief Construct from a range of reference ids which redirects the `ref_ids_ptr` member (non-owning).
59+
/*!\brief Construct from a range of reference ids.
6060
* \param[in] The plain text header.
6161
* \param[in] ref_ids The range over reference ids to redirect the pointer at.
6262
*/
63+
template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
6364
header(ref_ids_type & ref_ids) :
64-
ref_ids_ptr{&ref_ids, ref_ids_deleter_noop},
65-
reference_names_present_in_header{true}
66-
{}
65+
reference_names_given_on_construction{true}
66+
{
67+
for (auto & id : ref_ids)
68+
reference_names.emplace_back(id);
69+
}
6770

68-
/*!\brief Construct from a rvalue range of reference ids which is moved into the `ref_ids_ptr` (owning).
71+
/*!\brief Construct from a rvalue range of reference ids. Ids are copied over.
6972
* \param[in] ref_ids The range over reference ids to own.
7073
*/
74+
template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
7175
header(ref_ids_type && ref_ids) :
72-
ref_ids_ptr{new ref_ids_type{std::move(ref_ids)}, ref_ids_deleter_default},
73-
reference_names_present_in_header{true}
74-
{}
76+
reference_names_given_on_construction{true}
77+
{
78+
for (auto & id : ref_ids)
79+
{
80+
owned_reference_names.push_back(id);
81+
reference_names.emplace_back(owned_reference_names.back());
82+
}
83+
}
7584
//!\}
7685

7786
//!\brief Defaulted three-way comparison.
7887
auto operator<=>(header const &) const = default;
7988

8089
inline void read(std::string_view header_string);
90+
8191
private:
82-
//!\brief The type of the internal ref_ids pointer. Allows dynamically setting ownership management.
83-
using ref_ids_ptr_t = std::unique_ptr<ref_ids_type, std::function<void(ref_ids_type*)>>;
84-
//!\brief Stream deleter that does nothing (no ownership assumed).
85-
static void ref_ids_deleter_noop(ref_ids_type *) {}
86-
//!\brief Stream deleter with default behaviour (ownership assumed).
87-
static void ref_ids_deleter_default(ref_ids_type * ptr) { delete ptr; }
88-
//!\brief The key's type of ref_dict.
89-
using key_type = std::conditional_t<std::ranges::contiguous_range<std::ranges::range_reference_t<ref_ids_type>>,
90-
std::span<seqan3::range_innermost_value_t<ref_ids_type> const>,
91-
seqan3::type_reduce_t<std::ranges::range_reference_t<ref_ids_type>>>;
92-
//!\brief The pointer to reference ids information (non-owning if reference information is given).
93-
ref_ids_ptr_t ref_ids_ptr{new ref_ids_type{}, ref_ids_deleter_default};
94-
//!\brief Whether reference names are present in the current header.
95-
bool reference_names_present_in_header{false};
96-
97-
//!\brief Custom hash function since std::hash is not defined for all range types (e.g. std::span<char>).
98-
struct key_hasher
99-
{
100-
//!\brief Hash a key.
101-
template <typename key_t>
102-
size_t operator()(key_t && key) const noexcept
103-
{
104-
using char_t = std::ranges::range_value_t<key_t>;
105-
size_t result{0};
106-
std::hash<char_t> h{};
107-
for (char_t character : key)
108-
{
109-
result *= 0x8F3F73B5CF1C9ADE;
110-
result += h(character);
111-
}
112-
return result;
113-
}
114-
};
92+
//!\brief In case no reference information is given on construction, the reference names are stored here.
93+
std::deque<std::string> owned_reference_names;
94+
95+
//!\brief The reference sequence names.
96+
std::vector<std::string_view> reference_names;
97+
98+
std::vector<std::tuple<int32_t, std::string>> reference_names_info{};
11599

100+
//!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
101+
std::unordered_map<std::string_view, int32_t> reference_name_to_pos{};
102+
103+
//!\brief Whether reference sequence names were given to the header on construction.
104+
bool reference_names_given_on_construction{false};
105+
106+
/* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const
107+
{
108+
// if (print_warnings)
109+
// {
110+
// seqan3::debug_stream << "[B.I.O sam format header warning in line " << line << "] "; // todo line numbers
111+
seqan3::debug_stream << "[B.I.O sam format header warning] ";
112+
(seqan3::debug_stream << ... << messages);
113+
seqan3::debug_stream << std::endl;
114+
// }
115+
}
116116
public:
117117
/*!\name [@HD] File-level meta data
118118
* \brief You can directly edit these member variables.
@@ -125,9 +125,10 @@ class header
125125
//!\}
126126

127127
/*!\name [@SQ] Reference sequence dictionary
128-
* \brief You can directly edit these member variables.
128+
* \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers.
129129
* \{
130130
*/
131+
131132
/*!\brief [@SQ SN] Reference sequence names
132133
*
133134
* \details
@@ -146,9 +147,9 @@ class header
146147
* In this case, the reference information is parsed from the records field::ref_id and stored in the header.
147148
* This member function then provides access to the unique list of reference names encountered in the records.
148149
*/
149-
ref_ids_type & ref_ids()
150+
std::vector<std::string_view> const & rnames()
150151
{
151-
return *ref_ids_ptr;
152+
return reference_names;
152153
}
153154

154155
/*!\brief [@SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
@@ -182,10 +183,25 @@ class header
182183
* * **UR:** URI of the sequence. This value may start with one of the standard protocols, e.g http: or ftp:. If
183184
* it does not start with one of these protocols, it is assumed to be a file-system path
184185
*/
185-
std::vector<std::tuple<int32_t, std::string>> ref_id_info{};
186+
std::vector<std::tuple<int32_t, std::string>> const & rnames_info()
187+
{
188+
return reference_names_info;
189+
}
190+
191+
//!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
192+
std::unordered_map<std::string_view, int32_t> const & rname_to_pos()
193+
{
194+
return reference_name_to_pos;
195+
}
186196

187-
//!\brief The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
188-
std::unordered_map<key_type, int32_t, key_hasher, seqan3::detail::view_equality_fn> ref_dict{};
197+
//!\brief Append a new rname entry with it's length and extra information and update reference_name_to_pos.
198+
void push_back_rname(std::string_view const rname, int32_t const length, std::string_view const extra_info)
199+
{
200+
owned_reference_names.emplace_back(rname);
201+
reference_names.emplace_back(owned_reference_names.back());
202+
reference_names_info.emplace_back(length, extra_info);
203+
reference_name_to_pos[reference_names.back()] = reference_names.size() - 1;
204+
}
189205
//!\}
190206

191207
/*!\name [@RG] Read groups
@@ -270,8 +286,7 @@ class header
270286
* not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual
271287
* error.
272288
*/
273-
template <typename ref_ids_type>
274-
void header<ref_ids_type>::read(std::string_view header_string)
289+
void header::read(std::string_view header_string)
275290
{
276291
auto stream_view = header_string | seqan3::views::single_pass_input;
277292

@@ -379,7 +394,7 @@ void header<ref_ids_type>::read(std::string_view header_string)
379394

380395
case make_tag('S', 'Q'): // SQ (sequence dictionary) tag
381396
{
382-
std::ranges::range_value_t<ref_ids_type> id;
397+
std::string id;
383398
std::tuple<int32_t, std::string> info{};
384399
bool sequence_length_was_present{};
385400

@@ -416,30 +431,30 @@ void header<ref_ids_type>::read(std::string_view header_string)
416431
if (!sequence_length_was_present)
417432
throw format_error{"The required LN tag in @SQ is missing."};
418433

419-
if (reference_names_present_in_header)
420-
{ // If reference information was given, the ids exist and we can fill ref_dict directly.
421-
auto id_it = ref_dict.find(id);
434+
if (!reference_names_given_on_construction)
435+
{ // Reference information was not given by the user but needs to be filled from the header
436+
push_back_rname(id, std::get<0>(info), std::get<1>(info));
437+
}
438+
else
439+
{ // If reference information was given we check them against the once in the header
440+
auto id_it = reference_name_to_pos.find(id);
422441

423-
if (id_it == ref_dict.end())
424-
throw format_error{seqan3::detail::to_string("Unknown reference name '", id, "' found in SAM header ",
425-
"(header.ref_ids(): "/* , ref_ids(), TODO */").")};
442+
if (id_it == reference_name_to_pos.end())
443+
{
444+
warning("The reference sequence name \"", id, "\" was present in the header but not in the "
445+
"user provided rnames.");
426446

427-
auto & given_ref_info = ref_id_info[id_it->second];
447+
push_back_rname(id, std::get<0>(info), std::get<1>(info));
448+
}
449+
else
450+
{
451+
assert(id_it->second < static_cast<decltype(id_it->second)>(reference_names_info.size()));
428452

429-
if (std::get<0>(given_ref_info) != std::get<0>(info))
430-
throw format_error{"Provided and header-based reference length differ."};
453+
if (std::get<0>(reference_names_info[id_it->second]) != std::get<0>(info))
454+
warning("Provided and header-based reference length differ for rname :\"", id, "\".");
431455

432-
ref_id_info[id_it->second] = std::move(info);
433-
}
434-
else
435-
{ // If not, we need to update the ids first and fill the reference dictionary afterwards.
436-
static_assert(!seqan3::detail::is_type_specialisation_of_v<ref_ids_type, std::deque>,
437-
"The range over reference ids must be of type std::deque such that pointers are not "
438-
"invalidated.");
439-
440-
ref_ids().push_back(id);
441-
ref_id_info.push_back(info);
442-
ref_dict[(ref_ids())[(ref_ids()).size() - 1]] = (ref_ids()).size() - 1;
456+
reference_names_info[id_it->second] = info; // update rname information
457+
}
443458
}
444459
break;
445460
}
@@ -552,4 +567,15 @@ void header<ref_ids_type>::read(std::string_view header_string)
552567
}
553568
}
554569

570+
//!\brief A datastructure that contains private data of sam IO records.
571+
//!\ingroup var_io
572+
struct record_private_data
573+
{
574+
//!\privatesection
575+
//!\brief Pointer to the header
576+
header const * header_ptr = nullptr;
577+
578+
friend bool operator==(record_private_data const &, record_private_data const &) = default; //!< Defaulted.
579+
};
580+
555581
} // namespace bio::map_io

0 commit comments

Comments
 (0)