Helper module including download functions for the *_get actions
# File lib/miga/cli/action/download/base.rb, line 10
#
# Registers on +opt+ the flags common to the download actions. Each handler
# stores the parsed value in +cli+:
#
# * --max-download INT     -> cli[:max_datasets] (coerced to Integer)
# * -m, --metadata STRING  -> cli[:metadata] (stored as the raw string)
#
# opt:: object responding to +on+ (an OptionParser-like interface).
def cli_base_flags(opt)
  opt.on(
    '--max-download INT', Integer,
    'Maximum number of datasets to download (by default: unlimited)'
  ) { |v| cli[:max_datasets] = v }
  opt.on(
    '-m', '--metadata STRING',
    'Metadata as key-value pairs separated by = and delimited by comma',
    'Values are saved as strings except for booleans (true / false) or nil'
  ) { |v| cli[:metadata] = v }
end
# File lib/miga/cli/action/download/base.rb, line 22
#
# Registers on +opt+ the flags that filter which datasets are processed.
#
# * --exclude PATH          -> cli[:exclude]
# * dry                     -> registered through cli.opt_flag
# * --ignore-until STRING   -> cli[:ignore_until]
# * --ignore-removed        -> cli[:ignore_removed]
# * get-metadata            -> registered through cli.opt_flag with :get_md
#                              (presumably the storage key; confirm in opt_flag)
# * --updated-before DATE   -> cli[:updated_before], parsed with DateTime.parse
#
# opt:: object responding to +on+ (an OptionParser-like interface).
def cli_filters(opt)
  opt.on(
    '--exclude PATH',
    'A file with dataset names to exclude'
  ) { |v| cli[:exclude] = v }
  cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
  opt.on(
    '--ignore-until STRING',
    'Ignores all datasets until a name is found (useful for large reruns)'
  ) { |v| cli[:ignore_until] = v }
  opt.on(
    '--ignore-removed',
    'Ignores entries removed from NCBI (by default fails on removed entries)'
  ) { |v| cli[:ignore_removed] = v }
  cli.opt_flag(
    opt, 'get-metadata',
    'Only download and update metadata for existing datasets', :get_md
  )
  opt.on(
    '--updated-before DATE',
    'Only download metadata for datasets last updated before the given date',
    'Requires --get-metadata, supports date or date-time'
  ) { |v| cli[:updated_before] = DateTime.parse(v) }
end
# File lib/miga/cli/action/download/base.rb, line 47
#
# Registers on +opt+ the flags that control how downloaded datasets are saved.
#
# * only-metadata               -> registered through cli.opt_flag with :only_md
# * --save-every INT            -> cli[:save_every] (coerced to Integer)
# * -q, --query                 -> cli[:query]
# * -u, --unlink                -> cli[:unlink]
# * -R, --remote-list PATH      -> cli[:remote_list]
# * --ncbi-taxonomy-dump [path] -> cli[:ncbi_taxonomy_dump]; stores +true+ when
#                                  the handler receives no path
#
# The --save-every help text embeds the value of cli[:save_every] at the time
# this method runs.
#
# opt:: object responding to +on+ (an OptionParser-like interface).
def cli_save_actions(opt)
  cli.opt_flag(
    opt, 'only-metadata',
    'Create datasets without input data but retrieve all metadata',
    :only_md
  )
  opt.on(
    '--save-every INT', Integer,
    'Save project every this many downloaded datasets',
    'If zero, it saves the project only once upon completion',
    "By default: #{cli[:save_every]}"
  ) { |v| cli[:save_every] = v }
  opt.on(
    '-q', '--query',
    'Register the datasets as queries, not reference datasets'
  ) { |v| cli[:query] = v }
  opt.on(
    '-u', '--unlink',
    'Unlink all datasets in the project missing from the download list'
  ) { |v| cli[:unlink] = v }
  opt.on(
    '-R', '--remote-list PATH',
    'Path to an output file with the list of all datasets listed remotely'
  ) { |v| cli[:remote_list] = v }
  opt.on(
    '--ncbi-taxonomy-dump [path]',
    'Path to an NCBI Taxonomy dump directory to query instead of API calls',
    'If the path is not passed, the dump is automatically downloaded'
  ) { |v| cli[:ncbi_taxonomy_dump] = v || true }
end
# File lib/miga/cli/action/download/base.rb, line 135
#
# Removes from +ds+ every entry named in the file at cli[:exclude].
#
# The file is read one name per line; lines starting with '#' are skipped and
# the trailing line terminator is stripped before the name is deleted.
# +ds+ is modified in place and also returned. When cli[:exclude] is nil the
# collection is returned untouched.
#
# ds:: collection responding to +delete+ (keyed by dataset name).
def discard_excluded(ds)
  return ds if cli[:exclude].nil?

  cli.say "Discarding datasets in #{cli[:exclude]}"
  # Stream the file instead of materializing every line first
  File.foreach(cli[:exclude], chomp: true) do |line|
    next if line.start_with?('#')

    ds.delete(line)
  end
  ds
end
# File lib/miga/cli/action/download/base.rb, line 166
#
# Walks the remote list +ds+ and saves each selected entry into project +p+.
#
# For every (name, body) pair the name is printed and recorded. An entry is
# skipped when:
# * cli[:ignore_until] is set and that name has not been reached yet, or
# * the presence of the dataset in +p+ does not match the mode: with
#   cli[:get_md] true only existing datasets are processed, with it false
#   only missing ones.
#
# Unless cli[:dry] is set, selected entries go through +save_entry+; when it
# returns a falsy value the entry is removed from both the count and the list.
#
# Saving: if cli[:save_every] is not 1, automatic saving is suspended via
# p.do_not_save for the duration of the loop, the project is saved every
# cli[:save_every] downloads (when greater than 1), and once more at the end.
#
# NOTE(review): the two +p.save!+ calls were missing from the extracted
# listing (only a dangling "!" survived) and were restored here; confirm
# against the upstream source.
#
# ds:: enumerable yielding name and body for each remote entry.
# p::  project responding to +dataset+, +do_not_save=+ and +save!+.
#
# Returns [names_listed, number_downloaded].
def download_entries(ds, p)
  cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
  p.do_not_save = true if cli[:save_every] != 1
  ignore = !cli[:ignore_until].nil?
  downloaded = 0
  d = []
  ds.each do |name, body|
    d << name
    cli.puts name
    # Stop ignoring as soon as the requested name shows up (inclusive)
    ignore = false if ignore && name == cli[:ignore_until]
    next if ignore || p.dataset(name).nil? == cli[:get_md]

    downloaded += 1
    unless cli[:dry]
      unless save_entry(name, body, p)
        # The entry could not be saved: undo the bookkeeping for it
        downloaded -= 1
        d.pop
        next
      end
      p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
    end
  end
  p.do_not_save = false
  p.save! if cli[:save_every] != 1
  [d, downloaded]
end
# File lib/miga/cli/action/download/base.rb, line 146
#
# Restricts +ds+ to entries whose project dataset is older than the cutoff.
#
# When cli[:updated_before] is not set, +ds+ is returned as is. Otherwise an
# entry is kept only if the project loaded through cli.load_project has a
# dataset with that name and its metadata[:updated], parsed with
# DateTime.parse, is strictly before cli[:updated_before].
#
# NOTE(review): the receiver of the filtering block was missing from the
# extracted listing (the block was attached to +cli.load_project+); it was
# restored as +ds.select+ so the method returns a filtered list that the
# later steps can consume. Confirm against the upstream source.
#
# ds:: collection of remote entries keyed by dataset name.
def exclude_newer(ds)
  return ds unless cli[:updated_before]

  project = cli.load_project
  ds.select do |name, _|
    d = project.dataset(name)
    d && DateTime.parse(d.metadata[:updated]) < cli[:updated_before]
  end
end
# File lib/miga/cli/action/download/base.rb, line 119
#
# Reports the outcome of a download run and optionally writes the remote list.
#
# Says how many datasets were listed (+d.size+) and how many were downloaded,
# or "to download" when cli[:dry] is set. If cli[:remote_list] is not nil,
# every element of +d+ is written to that path, one per line, replacing any
# previous content.
#
# NOTE(review): the +File.open(cli+ prefix was missing from the extracted
# listing and was restored from the surviving "[:remote_list], 'w') do |fh|".
#
# d::          list of dataset names listed remotely.
# downloaded:: number of datasets downloaded (or selected, in a dry run).
def finalize_tasks(d, downloaded)
  cli.say "Datasets listed: #{d.size}"
  act = cli[:dry] ? 'to download' : 'downloaded'
  cli.say "Datasets #{act}: #{downloaded}"
  return if cli[:remote_list].nil?

  File.open(cli[:remote_list], 'w') do |fh|
    d.each { |i| fh.puts i }
  end
end
# File lib/miga/cli/action/download/base.rb, line 78
#
# Runs the complete download workflow: loads the project and the remote list
# (+load_tasks+), downloads the entries (+download_entries+), reports the
# results (+finalize_tasks+) and, when cli[:unlink] is set, unlinks every
# project dataset that is absent from the list of remote names.
def generic_perform
  p, ds = load_tasks
  d, downloaded = download_entries(ds, p)

  # Finalize
  finalize_tasks(d, downloaded)
  unlink_entries(p, p.dataset_names - d) if cli[:unlink]
end
# File lib/miga/cli/action/download/base.rb, line 156
#
# Randomly subsamples +ds+ down to cli[:max_datasets] entries.
#
# The limit is read with +to_i+, so an unset option counts as zero. Nothing
# happens unless the limit is positive and smaller than +ds.size+. Otherwise
# a random sample of that many keys is drawn and every other entry is removed
# from +ds+ in place. +ds+ is always returned.
#
# NOTE(review): the extracted listing had lost the operand of "!" in the
# condition and the receiver of the bang-method taking the |k, _| block;
# they were restored as a non-zero check on the limit and +ds.select!+.
# A positive check is used so that a negative limit is treated as "no limit"
# instead of reaching Array#sample, which rejects negative sizes.
#
# ds:: Hash-like collection keyed by dataset name.
def impose_limit(ds)
  max = cli[:max_datasets].to_i
  if max.positive? && max < ds.size
    cli.say "Subsampling list from #{ds.size} to #{max} datasets"
    # Index the sampled keys so each membership test is a hash lookup
    keep = ds.keys.sample(max).each_with_object({}) { |k, h| h[k] = true }
    ds.select! { |k, _| keep.key?(k) }
  end
  ds
end
# File lib/miga/cli/action/download/base.rb, line 98
#
# Makes MiGA::RemoteDataset use a local NCBI Taxonomy dump, if requested.
#
# * cli[:ncbi_taxonomy_dump] falsy: does nothing.
# * cli[:ncbi_taxonomy_dump] == true: downloads taxdump.tar.gz into a
#   temporary directory, extracts it there with +tar+, and registers that
#   directory. The directory only exists for the duration of the block
#   passed to Dir.mktmpdir.
# * any other value: treated as the path to an existing dump directory and
#   registered directly.
#
# NOTE(review): the URL literal and the +File.open(path+ call were missing
# from the extracted listing. The URL below points at NCBI's public taxonomy
# FTP area and is a reconstruction; confirm it against the upstream source
# before relying on it.
def load_ncbi_taxonomy_dump
  return unless cli[:ncbi_taxonomy_dump]

  if cli[:ncbi_taxonomy_dump] == true
    cli.say 'Downloading and reading NCBI Taxonomy dump'
    Dir.mktmpdir do |dir|
      file = 'taxdump.tar.gz'
      path = File.join(dir, file)
      url = 'https://ftp.ncbi.nih.gov/pub/taxonomy/%s' % file
      # Binary mode: the payload is a gzip-compressed tarball
      File.open(path, 'wb') { |fh| fh.print MiGA::MiGA.net_method(:get, url) }
      MiGA::MiGA.run_cmd('cd "%s" && tar -zxf "%s"' % [dir, file])
      MiGA::RemoteDataset.use_ncbi_taxonomy_dump(dir, cli)
    end
  else
    cli.say "Reading NCBI Taxonomy dump: #{cli[:ncbi_taxonomy_dump]}"
    MiGA::RemoteDataset.use_ncbi_taxonomy_dump(cli[:ncbi_taxonomy_dump], cli)
  end
end
# File lib/miga/cli/action/download/base.rb, line 87
#
# Prepares everything needed before downloading.
#
# In order: runs +sanitize_cli+, loads the project through cli.load_project,
# sets up the NCBI Taxonomy dump (+load_ncbi_taxonomy_dump+), obtains the
# remote list (+remote_list+) and passes it through +discard_excluded+,
# +exclude_newer+ and +impose_limit+.
#
# Returns [project, remote_entries].
def load_tasks
  sanitize_cli
  p = cli.load_project
  load_ncbi_taxonomy_dump
  ds = remote_list
  ds = discard_excluded(ds)
  ds = exclude_newer(ds)
  ds = impose_limit(ds)
  [p, ds]
end
# File lib/miga/cli/action/download/base.rb, line 196
#
# Saves (or refreshes the metadata of) one remote entry in project +p+.
#
# Sets body[:md][:metadata_only] when cli[:only_md] is on, builds the remote
# dataset from body[:ids], body[:db] and body[:universe], and then either:
# * with cli[:get_md]: updates the metadata of the existing dataset +name+, or
# * otherwise: saves the remote dataset into +p+ (as a reference unless
#   cli[:query] is set), adds it to the project and passes the result to
#   cli.add_metadata.
#
# NOTE(review): the constructor call was missing from the extracted listing
# (only "rd =[:ids], ...)" survived); it was restored as
# MiGA::RemoteDataset.new, the class used elsewhere in this file. Confirm
# against the upstream source.
#
# name:: dataset name.
# body:: Hash with the keys :ids, :db, :universe and :md.
# p::    project responding to +dataset+ and +add_dataset+.
#
# Returns true on success, or false when the remote data is missing and
# cli[:ignore_removed] is set. Re-raises MiGA::RemoteDataMissingError when
# cli[:ignore_removed] is not set.
def save_entry(name, body, p)
  cli.say " Locating remote dataset: #{name}"
  body[:md][:metadata_only] = true if cli[:only_md]
  rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
  if cli[:get_md]
    cli.say ' Updating dataset'
    rd.update_metadata(p.dataset(name), body[:md])
  else
    cli.say ' Creating dataset'
    rd.save_to(p, name, !cli[:query], body[:md])
    cli.add_metadata(p.add_dataset(name))
  end
  true
rescue MiGA::RemoteDataMissingError
  # Bare raise re-raises the exception currently being handled
  raise unless cli[:ignore_removed]

  cli.say " Removed dataset ignored: #{name}"
  false
end
# File lib/miga/cli/action/download/base.rb, line 130
#
# Unlinks from project +p+ every dataset named in +unlink+ and calls
# +remove!+ on the object returned by each p.unlink_dataset call, then
# reports how many names were processed.
#
# p::      project responding to +unlink_dataset+.
# unlink:: list of dataset names to unlink.
def unlink_entries(p, unlink)
  unlink.each { |i| p.unlink_dataset(i).remove! }
  cli.say "Datasets unlinked: #{unlink.size}"
end