module MiGA::Cli::Action::Download::Base

Helper module that provides the download functions shared by the *_get actions.

Public Instance Methods

cli_base_flags(opt) click to toggle source
# File lib/miga/cli/action/download/base.rb, line 10
# Registers on the option parser +opt+ the two basic download flags:
# +--max-download+ (parsed as Integer, stored in +cli[:max_datasets]+) and
# +-m+/+--metadata+ (stored verbatim in +cli[:metadata]+).
def cli_base_flags(opt)
  limit_help = 'Maximum number of datasets to download (by default: unlimited)'
  opt.on('--max-download INT', Integer, limit_help) do |limit|
    cli[:max_datasets] = limit
  end

  metadata_help = [
    'Metadata as key-value pairs separated by = and delimited by comma',
    'Values are saved as strings except for booleans (true / false) or nil'
  ]
  opt.on('-m', '--metadata STRING', *metadata_help) do |pairs|
    cli[:metadata] = pairs
  end
end
cli_filters(opt) click to toggle source
# File lib/miga/cli/action/download/base.rb, line 22
# Registers on the option parser +opt+ the flags that filter which remote
# datasets are processed. Values are stored in +cli+ under +:exclude+,
# +:ignore_until+, +:ignore_removed+ and +:updated_before+ (the latter parsed
# with DateTime.parse); the +dry+ and +get-metadata+ flags are delegated to
# +cli.opt_flag+.
def cli_filters(opt)
  opt.on('--exclude PATH', 'A file with dataset names to exclude') do |path|
    cli[:exclude] = path
  end

  cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')

  opt.on(
    '--ignore-until STRING',
    'Ignores all datasets until a name is found (useful for large reruns)'
  ) do |name|
    cli[:ignore_until] = name
  end

  opt.on(
    '--ignore-removed',
    'Ignores entries removed from NCBI (by default fails on removed entries)'
  ) do |flag|
    cli[:ignore_removed] = flag
  end

  get_md_help = 'Only download and update metadata for existing datasets'
  cli.opt_flag(opt, 'get-metadata', get_md_help, :get_md)

  opt.on(
    '--updated-before DATE',
    'Only download metadata for datasets last updated before the given date',
    'Requires --get-metadata, supports date or date-time'
  ) do |date|
    cli[:updated_before] = DateTime.parse(date)
  end
end
cli_save_actions(opt) click to toggle source
# File lib/miga/cli/action/download/base.rb, line 47
# Registers on the option parser +opt+ the flags that control how downloaded
# datasets are saved. Values are stored in +cli+ under +:save_every+,
# +:query+, +:unlink+, +:remote_list+ and +:ncbi_taxonomy_dump+; the
# +only-metadata+ flag is delegated to +cli.opt_flag+ with key +:only_md+.
def cli_save_actions(opt)
  only_md_help = 'Create datasets without input data but retrieve all metadata'
  cli.opt_flag(opt, 'only-metadata', only_md_help, :only_md)

  opt.on(
    '--save-every INT', Integer,
    'Save project every this many downloaded datasets',
    'If zero, it saves the project only once upon completion',
    # The current value is interpolated when the flag is registered
    "By default: #{cli[:save_every]}"
  ) do |interval|
    cli[:save_every] = interval
  end

  query_help = 'Register the datasets as queries, not reference datasets'
  opt.on('-q', '--query', query_help) { |flag| cli[:query] = flag }

  unlink_help =
    'Unlink all datasets in the project missing from the download list'
  opt.on('-u', '--unlink', unlink_help) { |flag| cli[:unlink] = flag }

  opt.on(
    '-R', '--remote-list PATH',
    'Path to an output file with the list of all datasets listed remotely'
  ) do |path|
    cli[:remote_list] = path
  end

  opt.on(
    '--ncbi-taxonomy-dump [path]',
    'Path to an NCBI Taxonomy dump directory to query instead of API calls',
    'If the path is not passed, the dump is automatically downloaded'
  ) do |path|
    # Without an explicit path the option is stored as +true+
    cli[:ncbi_taxonomy_dump] = path || true
  end
end
discard_excluded(ds) click to toggle source
# File lib/miga/cli/action/download/base.rb, line 135
# Removes from +ds+ (in place) every key listed in the file +cli[:exclude]+,
# one name per line, skipping lines that start with +#+. Does nothing when
# +cli[:exclude]+ is nil. Returns +ds+.
def discard_excluded(ds)
  list_path = cli[:exclude]
  return ds if list_path.nil?

  cli.say "Discarding datasets in #{list_path}"
  entries = File.readlines(list_path).reject { |line| line =~ /^#/ }
  names = entries.map(&:chomp)
  names.each { |name| ds.delete(name) }
  ds
end
download_entries(ds, p) click to toggle source
# File lib/miga/cli/action/download/base.rb, line 166
# Iterates over the remote entries +ds+ (name => body) and saves each one into
# project +p+ through +save_entry+, unless +cli[:dry]+ is set.
#
# Entries are skipped (but still listed) while +cli[:ignore_until]+ has not
# been reached, and whenever the presence of the dataset in +p+ does not match
# the mode: with +cli[:get_md]+ only existing datasets are processed, without
# it only missing ones. Entries for which +save_entry+ returns false are
# neither listed nor counted.
#
# Unless +cli[:save_every]+ is 1, +p.do_not_save+ is set for the duration of
# the loop, +p+ is saved every +cli[:save_every]+ processed entries (when
# greater than 1), and once more at the end.
#
# Returns a two-element Array: the list of names seen, and the number of
# entries downloaded (or that would be downloaded, in a dry run).
def download_entries(ds, p)
  noun = ds.size == 1 ? 'entry' : 'entries'
  cli.say "Downloading #{ds.size} " + noun

  deferred_saving = cli[:save_every] != 1
  p.do_not_save = true if deferred_saving

  waiting = !cli[:ignore_until].nil?
  listed = []
  count = 0
  ds.each do |name, body|
    listed << name
    cli.puts name
    waiting = false if waiting && name == cli[:ignore_until]
    next if waiting
    next if p.dataset(name).nil? == cli[:get_md]

    if cli[:dry]
      count += 1
      next
    end

    if save_entry(name, body, p)
      count += 1
      interval = cli[:save_every]
      p.save! if interval > 1 && (count % interval).zero?
    else
      # The entry could not be saved: drop it from the list of seen names
      listed.pop
    end
  end

  p.do_not_save = false
  p.save! if deferred_saving
  [listed, count]
end
exclude_newer(ds) click to toggle source
# File lib/miga/cli/action/download/base.rb, line 146
# Returns +ds+ untouched unless +cli[:updated_before]+ is set. Otherwise
# returns the subset of +ds+ whose names exist as datasets in the project
# loaded by +cli.load_project+ and whose +:updated+ metadata (parsed with
# DateTime.parse) is earlier than +cli[:updated_before]+.
def exclude_newer(ds)
  cutoff = cli[:updated_before]
  return ds unless cutoff

  project = cli.load_project
  ds.select do |name|
    dataset = project.dataset(name)
    # Names without a matching dataset in the project are dropped
    next false unless dataset

    DateTime.parse(dataset.metadata[:updated]) < cutoff
  end
end
finalize_tasks(d, downloaded) click to toggle source
# File lib/miga/cli/action/download/base.rb, line 119
# Reports the number of datasets listed (+d.size+) and downloaded
# (+downloaded+) through +cli.say+ and, when +cli[:remote_list]+ is set,
# writes every element of +d+ to that file, one per line.
def finalize_tasks(d, downloaded)
  cli.say "Datasets listed: #{d.size}"
  verb = 'downloaded'
  verb = 'to download' if cli[:dry]
  cli.say "Datasets #{verb}: #{downloaded}"

  list_path = cli[:remote_list]
  return if list_path.nil?

  File.open(list_path, 'w') { |fh| d.each { |name| fh.puts name } }
end
generic_perform() click to toggle source
# File lib/miga/cli/action/download/base.rb, line 78
# Runs the complete download workflow: loads the tasks, downloads the entries,
# reports the results and, if +cli[:unlink]+ is set, passes to
# +unlink_entries+ the project dataset names that were not listed.
def generic_perform
  project, remote = load_tasks
  listed, count = download_entries(remote, project)

  # Finalize
  finalize_tasks(listed, count)
  return unless cli[:unlink]

  unlink_entries(project, project.dataset_names - listed)
end
impose_limit(ds) click to toggle source
# File lib/miga/cli/action/download/base.rb, line 156
# Randomly subsamples +ds+ (in place) down to +cli[:max_datasets]+ entries
# when that limit is set, non-zero and smaller than the size of +ds+. The
# relative order of the surviving entries is preserved. Returns +ds+.
#
# The keys to drop are computed with a single array difference instead of
# testing every key against the sample with +include?+, which made the
# selection quadratic on large remote lists.
def impose_limit(ds)
  max = cli[:max_datasets].to_i
  return ds if max.zero? || max >= ds.size

  cli.say "Subsampling list from #{ds.size} to #{max} datasets"
  keys = ds.keys
  # A negative limit still raises ArgumentError from Array#sample, as before
  (keys - keys.sample(max)).each { |key| ds.delete(key) }
  ds
end
load_ncbi_taxonomy_dump() click to toggle source
# File lib/miga/cli/action/download/base.rb, line 98
# Registers an NCBI Taxonomy dump with MiGA::RemoteDataset when
# +cli[:ncbi_taxonomy_dump]+ is set. If the option holds a path, that
# directory is used directly. If it is +true+, +taxdump.tar.gz+ is fetched
# into a temporary directory, extracted with +tar+, and read from there.
def load_ncbi_taxonomy_dump
  dump = cli[:ncbi_taxonomy_dump]
  return unless dump

  unless dump == true
    cli.say "Reading NCBI Taxonomy dump: #{dump}"
    return MiGA::RemoteDataset.use_ncbi_taxonomy_dump(dump, cli)
  end

  cli.say 'Downloading and reading NCBI Taxonomy dump'
  Dir.mktmpdir do |tmp|
    archive = 'taxdump.tar.gz'
    target = File.join(tmp, archive)
    url = "https://ftp.ncbi.nih.gov/pub/taxonomy/#{archive}"

    File.open(target, 'wb') { |fh| fh.print MiGA::MiGA.net_method(:get, url) }
    MiGA::MiGA.run_cmd(format('cd "%s" && tar -zxf "%s"', tmp, archive))
    # The dump is read inside the block, while the temporary folder exists
    MiGA::RemoteDataset.use_ncbi_taxonomy_dump(tmp, cli)
  end
end
load_tasks() click to toggle source
# File lib/miga/cli/action/download/base.rb, line 87
# Prepares a download run: sanitizes the CLI options, loads the project and
# the optional NCBI Taxonomy dump, then builds the list of remote datasets by
# passing +remote_list+ through +discard_excluded+, +exclude_newer+ and
# +impose_limit+, in that order. Returns +[project, datasets]+.
def load_tasks
  sanitize_cli
  project = cli.load_project
  load_ncbi_taxonomy_dump
  datasets = impose_limit(exclude_newer(discard_excluded(remote_list)))
  [project, datasets]
end
save_entry(name, body, p) click to toggle source

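Saves the (generic remote) entry identified by name with body into the project p. Returns true on success, or false when the remote entry is missing and the ignore_removed option is set; without that option, the missing-entry error is re-raised.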

# File lib/miga/cli/action/download/base.rb, line 196
# Saves the remote entry +name+ described by +body+ into project +p+.
#
# With +cli[:get_md]+ only the metadata of the existing dataset is updated;
# otherwise the dataset is created (as reference unless +cli[:query]+) and
# the CLI metadata is added to it. When +cli[:only_md]+ is set, the entry
# metadata is flagged with +metadata_only+.
#
# Returns true on success. If MiGA::RemoteDataMissingError is raised, returns
# false when +cli[:ignore_removed]+ is set and re-raises the error otherwise.
def save_entry(name, body, p)
  cli.say "  Locating remote dataset: #{name}"
  body[:md][:metadata_only] = true if cli[:only_md]
  remote = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])

  if cli[:get_md]
    cli.say '  Updating dataset'
    remote.update_metadata(p.dataset(name), body[:md])
    return true
  end

  cli.say '  Creating dataset'
  remote.save_to(p, name, !cli[:query], body[:md])
  cli.add_metadata(p.add_dataset(name))
  true
rescue MiGA::RemoteDataMissingError => e
  raise e unless cli[:ignore_removed]

  cli.say "    Removed dataset ignored: #{name}"
  false
end