-
Notifications
You must be signed in to change notification settings - Fork 1
/
fastq_subsampler.rb
executable file
·77 lines (54 loc) · 1.96 KB
/
fastq_subsampler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env ruby
require 'biopieces'
require 'optparse'
# With no arguments at all, behave as if -h was given so the user sees the
# usage text instead of an error about the missing -n option.
ARGV << "-h" if ARGV.empty?

# Parsed command-line switches:
#   :number  - Integer, total number of reads to write (validated below)
#   :output  - String, name of the output file
#   :verbose - set when -v is given; enables progress reporting on stderr
options = {}

OptionParser.new do |opts|
  opts.banner = <<USAGE
Subsamples read pairs from interleaved FASTQ file.
Usage: #{File.basename(__FILE__)} [options] <FASTQ file>
USAGE

  opts.on("-h", "--help", "Display this screen") do
    $stderr.puts opts
    exit
  end

  # The value counts individual reads, not pairs: it has to be even, and the
  # processing code further down draws number / 2 pairs and compares the value
  # against the record count of the input. The help text says so explicitly.
  opts.on("-n", "--number <int>", Integer, "Number of reads to subsample (even; reads are selected as pairs)") do |o|
    options[:number] = o
  end

  opts.on("-o", "--output <file>", String, "Name of output file") do |o|
    options[:output] = o
  end

  opts.on("-v", "--verbose", "Verbose output") do |o|
    options[:verbose] = o
  end
end.parse! # consumes the switches from ARGV; positional arguments remain

# Validate -n: it is mandatory, must cover at least one pair (two reads) and
# must be even because reads are always emitted two at a time.
raise OptionParser::MissingArgument, "No number specified." unless options[:number]
raise OptionParser::InvalidArgument, "Number must be >= 2 - not #{options[:number]}" unless options[:number] >= 2
raise OptionParser::InvalidArgument, "Number must be even - not #{options[:number]}" unless options[:number].even?
# The input FASTQ file is the first positional argument left in ARGV.
file = ARGV.first

# Without this guard `wc -l` would be started with no file operand and would
# block forever waiting on stdin.
raise OptionParser::MissingArgument, "No FASTQ file specified." unless file

$stderr.puts "Processing file: #{file}" if options[:verbose]

# Count lines with wc. The array form of IO.popen runs the program directly,
# without a shell, so file names containing spaces or shell metacharacters are
# passed through verbatim; "--" keeps a leading "-" from being read as a switch.
wc_output = IO.popen(["wc", "-l", "--", file], &:read)
line_count = wc_output.to_s[/^\s*(\d+)/, 1]
raise "Could not count lines in file: #{file}" unless line_count

lines   = line_count.to_i
records = lines / 4 # the script assumes four lines per FASTQ record

raise "Requested number of random records > number of records: #{options[:number]} > #{records}" if options[:number] > records

# Draw the pairs to keep. A pair starts at an even record index
# (0, 2, 4, ...); only starts that have a mate inside the file are eligible,
# i.e. 0, 2, ..., 2 * (pairs - 1). The upper bound is exclusive on purpose:
# index `records` lies past the end of the file and can never be matched.
pairs       = records / 2
pair_starts = (0...pairs).map { |pair| pair * 2 }.sample(options[:number] / 2)
last_start  = pair_starts.max

# Hash lookup table: record index of a pair's first read => true.
wanted = {}
pair_starts.each { |start| wanted[start] = true }

i        = 0 # record index of the pair about to be read
selected = 0 # number of reads written so far

BioPieces::Fastq.open(options[:output], 'w') do |output|
  BioPieces::Fastq.open(file) do |input|
    input.each_slice(2) do |entry1, entry2|
      if wanted[i]
        output.puts entry1.to_fastq
        output.puts entry2.to_fastq
        selected += 2
      end

      i += 2

      $stderr.puts "Processed: #{i} selected: #{selected}" if options[:verbose] && (i % 10_000).zero?

      # Every wanted pair has been seen; no need to read the rest of the file.
      break if i > last_start
    end
  end
end

$stderr.puts "Processed: #{i} selected: #{selected}" if options[:verbose]