SeqAn3  3.1.0-rc.1
The Modern C++ library for sequence analysis.
misc_input.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
#include <seqan3/std/algorithm>
#include <array>
#include <cassert>
#include <seqan3/std/concepts>
#include <seqan3/std/filesystem>
#include <functional>
#include <iostream>
#include <iterator>
#include <memory>
#include <seqan3/std/ranges>
#include <seqan3/std/span>
#include <string>
#include <tuple>
23 
24 #if defined(SEQAN3_HAS_BZIP2)
25  #include <seqan3/contrib/stream/bz2_istream.hpp>
26 #endif
27 #if defined(SEQAN3_HAS_ZLIB)
28  #include <seqan3/contrib/stream/bgzf_istream.hpp>
30  #include <seqan3/contrib/stream/gz_istream.hpp>
31 #endif
33 #include <seqan3/io/exception.hpp>
35 
36 namespace seqan3::detail
37 {
38 
44 template <std::ranges::forward_range ref_t, std::ranges::forward_range query_t>
45 inline bool starts_with(ref_t && reference, query_t && query)
47  requires std::equality_comparable_with<std::ranges::range_reference_t<ref_t>,
48  std::ranges::range_reference_t<query_t>>
50 {
51  auto rit = std::ranges::begin(reference);
52  auto rend = std::ranges::end(reference);
53 
54  auto qit = std::ranges::begin(query);
55  auto qend = std::ranges::end(query);
56 
57  while (true)
58  {
59  if (qit == qend)
60  return true;
61 
62  if (rit == rend)
63  return false;
64 
65  if (*qit != *rit)
66  return false;
67 
68  ++qit;
69  ++rit;
70  }
71 }
72 
80 template <builtin_character char_t>
83 {
84  assert(primary_stream.good());
85 
86  // don't assume ownership
87  constexpr auto stream_deleter_noop = [] (std::basic_istream<char_t> *) {};
88  // assume ownership
89  [[maybe_unused]] constexpr auto stream_deleter_default = [] (std::basic_istream<char_t> * ptr) { delete ptr; };
90 
91  // extract "magic header"
92  std::istreambuf_iterator<char_t> it{primary_stream};
93  std::array<char, bgzf_compression::magic_header.size()> magic_number{}; // Largest magic header from bgzf
94  size_t read_chars = 0;
95  for (; read_chars < magic_number.size(); ++read_chars)
96  {
98  break;
99 
100  magic_number[read_chars] = *it;
101  ++it;
102  }
103 
104  // unget all read chars.
105  for (size_t i = 0 ; i < read_chars; ++i)
106  primary_stream.unget();
107 
108  std::string extension{};
109  if (filename.has_extension())
110  extension = filename.extension().string().substr(1);
111 
112  // tests whether the given extension matches with one of the given compression tags.
113  [[maybe_unused]] auto contains_extension = [] (auto compression_tag, auto const & extension) constexpr
114  {
115  return std::ranges::find(decltype(compression_tag)::file_extensions, extension) !=
116  std::ranges::end(decltype(compression_tag)::file_extensions);
117  };
118 
119  // set return value appropriately
120  if (read_chars == magic_number.size() && bgzf_compression::validate_header(std::span{magic_number})) // BGZF
121  {
122  #if defined(SEQAN3_HAS_ZLIB)
123  if (contains_extension(gz_compression{}, extension) || contains_extension(bgzf_compression{}, extension))
124  filename.replace_extension();
125 
126  return {new contrib::basic_bgzf_istream<char_t>{primary_stream},
127  stream_deleter_default};
128  #else
129  throw file_open_error{"Trying to read from a bgzf file, but no ZLIB available."};
130  #endif
131  }
132  else if (starts_with(magic_number, gz_compression::magic_header)) // GZIP
133  {
134  #if defined(SEQAN3_HAS_ZLIB)
135  if (contains_extension(gz_compression{}, extension) || contains_extension(bgzf_compression{}, extension))
136  filename.replace_extension();
137 
138  return {new contrib::basic_gz_istream<char_t>{primary_stream}, stream_deleter_default};
139  #else
140  throw file_open_error{"Trying to read from a gzipped file, but no ZLIB available."};
141  #endif
142  }
143  else if (starts_with(magic_number, bz2_compression::magic_header)) // BZip2
144  {
145  #if defined(SEQAN3_HAS_BZIP2)
146  if (contains_extension(bz2_compression{}, extension))
147  filename.replace_extension();
148 
149  return {new contrib::basic_bz2_istream<char_t>{primary_stream}, stream_deleter_default};
150  #else
151  throw file_open_error{"Trying to read from a bzipped file, but no libbz2 available."};
152  #endif
153  }
154  else if (starts_with(magic_number, zstd_compression::magic_header)) // ZStd
155  {
156  throw file_open_error{"Trying to read from a zst'ed file, but SeqAn does not yet support this."};
157  }
158 
159  return {&primary_stream, stream_deleter_noop};
160 }
161 
163 template <builtin_character char_t>
165 {
167  return make_secondary_istream(primary_stream, p);
168 }
169 
170 } // namespace seqan3::detail
Adaptations of algorithms from the Ranges TS.
Provides stream compression utilities.
The Concepts library.
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
This header includes C++17 filesystem support and imports it into namespace std::filesystem (independent of whether it is marked as "experimental").
auto make_secondary_istream(std::basic_istream< char_t > &primary_stream, std::filesystem::path &filename) -> std::unique_ptr< std::basic_istream< char_t >, std::function< void(std::basic_istream< char_t > *)>>
Depending on the magic bytes of the given stream, return a decompression stream or forward the primary stream.
Definition: misc_input.hpp:81
bool starts_with(ref_t &&reference, query_t &&query)
Check whether the query range is a prefix of the reference range.
Definition: misc_input.hpp:45
constexpr ptrdiff_t find
Get the index of the first occurrence of a type in a pack.
Definition: traits.hpp:187
Provides exceptions used in the I/O module.
Provides seqan3::detail::magic_header.
The internal SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Adaptations of concepts from the Ranges TS.
T size(T... args)
Provides std::span from the C++20 standard library.
A tag signifying a bgzf compressed file.
Definition: magic_header.hpp:74
static bool validate_header(std::span< char_t, extend > header)
Checks if the given header is a bgzf header.
Definition: magic_header.hpp:97
static constexpr std::array< char, 18 > magic_header
The magic byte sequence to disambiguate bgzf compressed files.
Definition: magic_header.hpp:83
A tag signifying a bz2 compressed file.
Definition: magic_header.hpp:46
static constexpr std::array< char, 3 > magic_header
The magic byte sequence to disambiguate bz2 compressed files.
Definition: magic_header.hpp:54
A tag signifying a gz compressed file.
Definition: magic_header.hpp:32
static constexpr std::array< char, 3 > magic_header
The magic byte sequence to disambiguate gz compressed files.
Definition: magic_header.hpp:40
static constexpr std::array< char, 4 > magic_header
The magic byte sequence to disambiguate zstd compressed files.
Definition: magic_header.hpp:68
Thrown if there is an unspecified filesystem or stream error while opening, e.g. permission problem.
Definition: exception.hpp:39