2828#include < seqan3/io/sam_file/detail/cigar.hpp>
2929#include < seqan3/utility/views/single_pass_input.hpp>
3030#include < seqan3/utility/views/type_reduce.hpp>
31+ #include < seqan3/core/debug_stream.hpp>
3132
3233#include < bio/exception.hpp>
3334#include < bio/misc.hpp>
@@ -41,7 +42,6 @@ namespace bio::map_io
4142 *
4243 * TODO
4344 */
44- template <typename ref_ids_type = std::vector<std::string>>
4545class header
4646{
4747public:
@@ -56,63 +56,63 @@ class header
5656 header & operator =(header const &) = default ; // !< Defaulted.
5757 header & operator =(header &&) = default ; // !< Defaulted.
5858
59- /* !\brief Construct from a range of reference ids which redirects the `ref_ids_ptr` member (non-owning) .
59+ /* !\brief Construct from a range of reference ids.
6060 * \param[in] The plain text header.
6161 * \param[in] ref_ids The range over reference ids to redirect the pointer at.
6262 */
63+ template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
6364 header (ref_ids_type & ref_ids) :
64- ref_ids_ptr{&ref_ids, ref_ids_deleter_noop},
65- reference_names_present_in_header{true }
66- {}
65+ reference_names_given_on_construction{true }
66+ {
67+ for (auto & id : ref_ids)
68+ reference_names.emplace_back (id);
69+ }
6770
68- /* !\brief Construct from a rvalue range of reference ids which is moved into the `ref_ids_ptr` (owning) .
71+ /* !\brief Construct from a rvalue range of reference ids. Ids are copied over .
6972 * \param[in] ref_ids The range over reference ids to own.
7073 */
74+ template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
7175 header (ref_ids_type && ref_ids) :
72- ref_ids_ptr{new ref_ids_type{std::move (ref_ids)}, ref_ids_deleter_default},
73- reference_names_present_in_header{true }
74- {}
76+ reference_names_given_on_construction{true }
77+ {
78+ for (auto & id : ref_ids)
79+ {
80+ owned_reference_names.push_back (id);
81+ reference_names.emplace_back (owned_reference_names.back ());
82+ }
83+ }
7584 // !\}
7685
7786 // !\brief Defaulted three-way comparison.
7887 auto operator <=>(header const &) const = default ;
7988
8089 inline void read (std::string_view header_string);
90+
8191private:
82- // !\brief The type of the internal ref_ids pointer. Allows dynamically setting ownership management.
83- using ref_ids_ptr_t = std::unique_ptr<ref_ids_type, std::function<void (ref_ids_type*)>>;
84- // !\brief Stream deleter that does nothing (no ownership assumed).
85- static void ref_ids_deleter_noop (ref_ids_type *) {}
86- // !\brief Stream deleter with default behaviour (ownership assumed).
87- static void ref_ids_deleter_default (ref_ids_type * ptr) { delete ptr; }
88- // !\brief The key's type of ref_dict.
89- using key_type = std::conditional_t <std::ranges::contiguous_range<std::ranges::range_reference_t <ref_ids_type>>,
90- std::span<seqan3::range_innermost_value_t <ref_ids_type> const >,
91- seqan3::type_reduce_t <std::ranges::range_reference_t <ref_ids_type>>>;
92- // !\brief The pointer to reference ids information (non-owning if reference information is given).
93- ref_ids_ptr_t ref_ids_ptr{new ref_ids_type{}, ref_ids_deleter_default};
94- // !\brief Whether reference names are present in the current header.
95- bool reference_names_present_in_header{false };
96-
97- // !\brief Custom hash function since std::hash is not defined for all range types (e.g. std::span<char>).
98- struct key_hasher
99- {
100- // !\brief Hash a key.
101- template <typename key_t >
102- size_t operator ()(key_t && key) const noexcept
103- {
104- using char_t = std::ranges::range_value_t <key_t >;
105- size_t result{0 };
106- std::hash<char_t > h{};
107- for (char_t character : key)
108- {
109- result *= 0x8F3F73B5CF1C9ADE ;
110- result += h (character);
111- }
112- return result;
113- }
114- };
92+ // !\brief In case no reference information is given on construction, the reference names are stored here.
93+ std::deque<std::string> owned_reference_names;
94+
95+ // !\brief The reference sequence names.
96+ std::vector<std::string_view> reference_names;
97+
98+ std::vector<std::tuple<int32_t , std::string>> reference_names_info{};
11599
100+ // !\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
101+ std::unordered_map<std::string_view, int32_t > reference_name_to_pos{};
102+
103+ // !\brief Whether reference sequence names were given to the header on construction.
104+ bool reference_names_given_on_construction{false };
105+
106+ /* [[noreturn]] compiler says this returns something...? */ void warning (auto const &... messages) const
107+ {
108+ // if (print_warnings)
109+ // {
110+ // seqan3::debug_stream << "[B.I.O sam format header warning in line " << line << "] "; // todo line numbers
111+ seqan3::debug_stream << " [B.I.O sam format header warning] " ;
112+ (seqan3::debug_stream << ... << messages);
113+ seqan3::debug_stream << std::endl;
114+ // }
115+ }
116116public:
117117 /* !\name [@HD] File-level meta data
118118 * \brief You can directly edit these member variables.
@@ -125,9 +125,10 @@ class header
125125 // !\}
126126
127127 /* !\name [@SQ] Reference sequence dictionary
128- * \brief You can directly edit these member variables.
128+ * \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers .
129129 * \{
130130 */
131+
131132 /* !\brief [@SQ SN] Reference sequence names
132133 *
133134 * \details
@@ -146,9 +147,9 @@ class header
146147 * In this case, the reference information is parsed from the records field::ref_id and stored in the header.
147148 * This member function then provides access to the unique list of reference names encountered in the records.
148149 */
149- ref_ids_type & ref_ids ()
150+ std::vector<std::string_view> const & rnames ()
150151 {
151- return *ref_ids_ptr ;
152+ return reference_names ;
152153 }
153154
154155 /* !\brief [@SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
@@ -182,10 +183,25 @@ class header
182183 * * **UR:** URI of the sequence. This value may start with one of the standard protocols, e.g http: or ftp:. If
183184 * it does not start with one of these protocols, it is assumed to be a file-system path
184185 */
185- std::vector<std::tuple<int32_t , std::string>> ref_id_info{};
186+ std::vector<std::tuple<int32_t , std::string>> const & rnames_info ()
187+ {
188+ return reference_names_info;
189+ }
190+
191+ // !\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
192+ std::unordered_map<std::string_view, int32_t > const & rname_to_pos ()
193+ {
194+ return reference_name_to_pos;
195+ }
186196
187- // !\brief The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
188- std::unordered_map<key_type, int32_t , key_hasher, seqan3::detail::view_equality_fn> ref_dict{};
197+ // !\brief Append a new rname entry with it's length and extra information and update reference_name_to_pos.
198+ void push_back_rname (std::string_view const rname, int32_t const length, std::string_view const extra_info)
199+ {
200+ owned_reference_names.emplace_back (rname);
201+ reference_names.emplace_back (owned_reference_names.back ());
202+ reference_names_info.emplace_back (length, extra_info);
203+ reference_name_to_pos[reference_names.back ()] = reference_names.size () - 1 ;
204+ }
189205 // !\}
190206
191207 /* !\name [@RG] Read groups
@@ -270,8 +286,7 @@ class header
270286 * not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual
271287 * error.
272288 */
273- template <typename ref_ids_type>
274- void header<ref_ids_type>::read(std::string_view header_string)
289+ void header::read (std::string_view header_string)
275290{
276291 auto stream_view = header_string | seqan3::views::single_pass_input;
277292
@@ -379,7 +394,7 @@ void header<ref_ids_type>::read(std::string_view header_string)
379394
380395 case make_tag (' S' , ' Q' ): // SQ (sequence dictionary) tag
381396 {
382- std::ranges:: range_value_t <ref_ids_type> id;
397+ std::string id;
383398 std::tuple<int32_t , std::string> info{};
384399 bool sequence_length_was_present{};
385400
@@ -416,30 +431,30 @@ void header<ref_ids_type>::read(std::string_view header_string)
416431 if (!sequence_length_was_present)
417432 throw format_error{" The required LN tag in @SQ is missing." };
418433
419- if (reference_names_present_in_header)
420- { // If reference information was given, the ids exist and we can fill ref_dict directly.
421- auto id_it = ref_dict.find (id);
434+ if (!reference_names_given_on_construction)
435+ { // Reference information was not given by the user but needs to be filled from the header
436+ push_back_rname (id, std::get<0 >(info), std::get<1 >(info));
437+ }
438+ else
439+ { // If reference information was given we check them against the once in the header
440+ auto id_it = reference_name_to_pos.find (id);
422441
423- if (id_it == ref_dict.end ())
424- throw format_error{seqan3::detail::to_string (" Unknown reference name '" , id, " ' found in SAM header " ,
425- " (header.ref_ids(): " /* , ref_ids(), TODO */ " )." )};
442+ if (id_it == reference_name_to_pos.end ())
443+ {
444+ warning (" The reference sequence name \" " , id, " \" was present in the header but not in the "
445+ " user provided rnames." );
426446
427- auto & given_ref_info = ref_id_info[id_it->second ];
447+ push_back_rname (id, std::get<0 >(info), std::get<1 >(info));
448+ }
449+ else
450+ {
451+ assert (id_it->second < static_cast <decltype (id_it->second )>(reference_names_info.size ()));
428452
429- if (std::get<0 >(given_ref_info ) != std::get<0 >(info))
430- throw format_error{ " Provided and header-based reference length differ. " } ;
453+ if (std::get<0 >(reference_names_info[id_it-> second ] ) != std::get<0 >(info))
454+ warning ( " Provided and header-based reference length differ for rname : \" " , id, " \" . " ) ;
431455
432- ref_id_info[id_it->second ] = std::move (info);
433- }
434- else
435- { // If not, we need to update the ids first and fill the reference dictionary afterwards.
436- static_assert (!seqan3::detail::is_type_specialisation_of_v<ref_ids_type, std::deque>,
437- " The range over reference ids must be of type std::deque such that pointers are not "
438- " invalidated." );
439-
440- ref_ids ().push_back (id);
441- ref_id_info.push_back (info);
442- ref_dict[(ref_ids ())[(ref_ids ()).size () - 1 ]] = (ref_ids ()).size () - 1 ;
456+ reference_names_info[id_it->second ] = info; // update rname information
457+ }
443458 }
444459 break ;
445460 }
@@ -552,4 +567,15 @@ void header<ref_ids_type>::read(std::string_view header_string)
552567 }
553568}
554569
570+ // !\brief A datastructure that contains private data of sam IO records.
571+ // !\ingroup var_io
572+ struct record_private_data
573+ {
574+ // !\privatesection
575+ // !\brief Pointer to the header
576+ header const * header_ptr = nullptr ;
577+
578+ friend bool operator ==(record_private_data const &, record_private_data const &) = default ; // !< Defaulted.
579+ };
580+
555581} // namespace bio::map_io
0 commit comments