module MiGA::Common::Format

General formatting functions shared throughout MiGA.

Public Instance Methods

clean_fasta_file(file, min_len = 1) click to toggle source

Cleans a FastA file in place, removing all sequences shorter than min_len

# File lib/miga/common/format.rb, line 28
def clean_fasta_file(file, min_len = 1)
  tmp_fh = nil
  tmp_path = nil
  begin
    if file =~ /\.gz/
      tmp_path = Tempfile.new('MiGA.gz').tap(&:close).path
      File.unlink tmp_path
      tmp_path += '.gz'
      tmp_fh = Zlib::GzipWriter.open(tmp_path, 9)
      fh = Zlib::GzipReader.open(file)
    else
      tmp_fh = Tempfile.new('MiGA')
      tmp_path = tmp_fh.path
      fh = File.open(file, 'r')
    end
    next_seq = ['', '']
    fh.each_line do |ln|
      ln.chomp!
      if ln =~ /^>\s*(\S+)(.*)/
        id, df = $1, $2
        if next_seq[1].length >= min_len
          tmp_fh.puts next_seq[0]
          tmp_fh.print next_seq[1].wrap_width(80)
        end
        next_seq = [">#{id.gsub(/[^A-Za-z0-9_\|\.]/, '_')}#{df}", '']
      else
        next_seq[1] += ln.gsub(/[^A-Za-z\.\-]/, '')
      end
    end
    if next_seq[1].length >= min_len
      tmp_fh.puts next_seq[0]
      tmp_fh.print next_seq[1].wrap_width(80)
    end
    tmp_fh.close
    fh.close
    FileUtils.mv(tmp_path, file)
  ensure
    begin
      tmp_fh.close unless tmp_fh.nil?
      File.unlink(tmp_path) unless tmp_path.nil?
    rescue
    end
  end
end
seqs_length(file, format, opts = {}) click to toggle source

Calculates the average and standard deviation of the sequence lengths in a FastA or FastQ file (supports gzipped files). The format must be a Symbol, one of :fasta or :fastq. Additional estimations can be controlled via the opts Hash. Supported options include:

  • :n50: Include the N50 and the median (in bp)

  • :gc: Include the G+C content (in %)

  • :x: Include the undetermined bases content (in %)

  • :skew: Include G-C and A-T sequence skew (in %; forces gc: true). See definition used here in DOI:10.1177/117693430700300006

# File lib/miga/common/format.rb, line 83
def seqs_length(file, format, opts = {})
  opts[:gc] = true if opts[:skew]
  fh = file =~ /\.gz/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
  l = []
  gc = 0
  xn = 0
  t  = 0
  c  = 0
  i  = 0 # <- Zlib::GzipReader doesn't set `$.`
  fh.each_line do |ln|
    i += 1
    if (format == :fasta and ln =~ /^>/) or
       (format == :fastq and (i % 4) == 1)
      l << 0
    elsif format == :fasta or (i % 4) == 2
      l[l.size - 1] += ln.chomp.size
      gc += ln.scan(/[GCgc]/).count if opts[:gc]
      xn += ln.scan(/[XNxn]/).count if opts[:x]
      if opts[:skew]
        t += ln.scan(/[Tt]/).count
        c += ln.scan(/[Cc]/).count
      end
    end
  end
  fh.close

  o = { n: l.size, tot: l.inject(0, :+), max: l.max }
  return o if o[:tot].zero?
  o[:avg] = o[:tot].to_f / l.size
  o[:var] = l.map { |a| a**2 }.inject(:+).to_f / l.size - o[:avg]**2
  o[:sd]  = Math.sqrt o[:var]
  o[:gc]  = 100.0 * gc / o[:tot] if opts[:gc]
  o[:x]   = 100.0 * xn / o[:tot] if opts[:x]
  if opts[:skew]
    at = o[:tot] - gc
    o[:at_skew] = 100.0 * (2 * t - at) / at
    o[:gc_skew] = 100.0 * (2 * c - gc) / gc
  end

  if opts[:n50]
    l.sort!
    thr = o[:tot] / 2
    pos = 0
    l.each do |a|
      pos += a
      o[:n50] = a
      break if pos >= thr
    end
    o[:med] = o[:n].even? ?
          0.5 * l[o[:n] / 2 - 1, 2].inject(:+) :
          l[(o[:n] - 1) / 2]
  end
  o
end
tabulate(header, values, tabular = false) click to toggle source

Tabulates an values, and Array of Arrays, all with the same number of entries as header. Returns an Array of String, one per line.

# File lib/miga/common/format.rb, line 11
def tabulate(header, values, tabular = false)
  fields = []
  fields << header.map(&:to_s) unless tabular && header.all?(&:nil?)
  fields << fields.first.map { |h| h.gsub(/\S/, '-') } unless tabular
  fields += values.map { |r| r.map { |cell| cell.nil? ? '?' : cell.to_s } }
  clen = tabular ? Array.new(header.size, 0) :
        fields.map { |r| r.map(&:length) }.transpose.map(&:max)
  fields.map do |r|
    (0..(clen.size - 1)).map do |col_n|
      col_n == 0 ? r[col_n].rjust(clen[col_n]) : r[col_n].ljust(clen[col_n])
    end.join(tabular ? "\t" : '  ')
  end
end