Import figshare data (fetched here from Dropbox mirrors) into QIIME 2 artifacts#

Author: @ebolyen

import pandas as pd
import hashlib
import qiime2

Create FeatureData[Sequence]#

import tempfile
import requests
import qiime2

# Download the ASV taxonomy table (SILVA 132 assignments, V4-V5 region).
data = requests.get("https://www.dropbox.com/s/uqj79nepvub1cxc/tblASVtaxonomy_silva132_v4v5_filter.csv?dl=1")

# Spool the downloaded bytes to a temp file so pandas can read by path;
# flush before read_csv so the content actually hits disk.
with tempfile.NamedTemporaryFile() as f:
    f.write(data.content)
    f.flush()
    pd_orig_taxa = pd.read_csv(f.name)

# Re-index every row by the MD5 hex digest of its ASV sequence.
# NOTE(review): presumably this matches the MD5-based feature IDs QIIME 2
# assigns to dereplicated sequences, so all artifacts built below share IDs
# -- confirm against the QIIME 2 pipeline that will consume these.
# The new index inherits the name 'Sequence' from the source column.
pd_orig_taxa.index = pd_orig_taxa['Sequence'].str.encode('ascii').apply(lambda x: hashlib.md5(x).hexdigest())
pd_orig_taxa
# Hash-indexed sequences become the representative-sequence artifact.
pd_seqs = pd_orig_taxa['Sequence']
pd_seqs
q2_rep_seqs = qiime2.Artifact.import_data('FeatureData[Sequence]', pd_seqs)
q2_rep_seqs
# Peek at the FASTA inside the artifact (IPython shell magic; reaches into
# the artifact's private on-disk archive layout via _archiver).
!head {str(q2_rep_seqs._archiver.path / str(q2_rep_seqs.uuid) / 'data' / 'dna-sequences.fasta')}
q2_rep_seqs.save('rep-seqs.qza')

Create FeatureData[Taxonomy]#

def collate_taxa_columns(row):
    """Collapse per-rank taxonomy/confidence columns into a 2-field Series.

    Builds a QIIME 2-style lineage string ('k__...; p__...; ...') from the
    'Kingdom'..'Genus' columns of *row*, descending ranks until one falls
    below a 0.7 confidence threshold.  The below-threshold rank's prefix is
    still appended (matching the original behavior), but descent stops there.

    Parameters
    ----------
    row : pd.Series
        One taxonomy row with 'Kingdom'..'Genus' and matching
        'ConfidenceKingdom'..'ConfidenceGenus' columns.

    Returns
    -------
    pd.Series
        Fields 'Taxon' (joined lineage) and 'Confidence' (confidence of the
        deepest rank that met the threshold), named after the input row so
        DataFrame.apply(axis=1) preserves the index.
    """
    assignments = []
    # Initialize so 'confidence' is always bound: previously it was assigned
    # only inside the loop's else branch, so a row whose *Kingdom* confidence
    # was already < 0.7 broke out before any assignment and raised
    # UnboundLocalError.  For every non-crashing input this initial value is
    # immediately confirmed (or superseded) by the loop, so results are
    # unchanged.
    confidence = row['ConfidenceKingdom']
    for taxon in ('Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus'):
        prefix = taxon[0].lower() + '__'
        assignment = row[taxon]
        if assignment == '<not present>':
            assignment = ''  # keep the bare rank prefix, e.g. 'k__'

        assignments.append(prefix + assignment)
        confidence_col = f'Confidence{taxon}'
        if row[confidence_col] < .7:
            # Low-confidence rank: its prefix stays in the lineage but we stop
            # descending; 'confidence' remains that of the last accepted rank.
            break
        confidence = row[confidence_col]

    new_series = pd.Series(dict(Taxon='; '.join(assignments), Confidence=confidence))
    new_series.name = row.name
    return new_series
        
            
# Collapse the per-rank columns into one (Taxon, Confidence) row per feature,
# then label the index as QIIME 2's taxonomy format expects.
pd_taxonomy = pd_orig_taxa.apply(collate_taxa_columns, axis=1)
pd_taxonomy.index.name = 'Feature ID'
pd_taxonomy
q2_taxonomy = qiime2.Artifact.import_data('FeatureData[Taxonomy]', pd_taxonomy)
q2_taxonomy
# Inspect the taxonomy TSV inside the artifact (IPython shell magic; uses the
# artifact's private on-disk archive layout via _archiver).
!head {str(q2_taxonomy._archiver.path / str(q2_taxonomy.uuid) / 'data' / 'taxonomy.tsv')}
q2_taxonomy.save('taxonomy.qza')

Create FeatureTable[Frequency]#

# Map original ASV labels -> the MD5-hash feature IDs used above.
# NOTE(review): this lookup only works if pd_orig_taxa's index is currently
# named 'Feature ID'.  It was built from the 'Sequence' column (so its name
# started as 'Sequence'); presumably pd_taxonomy.index.name = 'Feature ID'
# mutated an Index object shared with pd_orig_taxa -- confirm, otherwise
# reset_index() produces no 'Feature ID' column and this raises KeyError.
asv_map = pd_orig_taxa['ASV'].reset_index().set_index('ASV')['Feature ID']
asv_map
# Download the wide-format ASV count table.
data = requests.get("https://www.dropbox.com/s/r5ag9d0lwlcg91n/tblcounts_asv_wide.csv?dl=1")

with tempfile.NamedTemporaryFile() as f:
    f.write(data.content)
    f.flush()
    # Transpose so rows are samples and columns are features, the
    # orientation QIIME 2 expects for a FeatureTable import from pandas.
    pd_asv_counts = pd.read_csv(f.name, index_col='ASV').transpose()

# Local-file alternative to the download above, kept for offline reruns:
#pd_asv_counts = pd.read_csv('tblcounts_asv_wide.csv', index_col='ASV').transpose()
# Relabel ASV columns with the hashed feature IDs so the table's features
# match rep-seqs.qza and taxonomy.qza.
pd_asv_counts = pd_asv_counts.rename(columns=asv_map)
pd_asv_counts
q2_table = qiime2.Artifact.import_data('FeatureTable[Frequency]', pd_asv_counts)
q2_table
# Summarize the BIOM table inside the artifact (IPython shell magic; uses the
# artifact's private on-disk archive layout via _archiver).
!biom summarize-table -i {str(q2_table._archiver.path / str(q2_table.uuid) / 'data' / 'feature-table.biom')} | head
q2_table.save('table.qza')