Query individual files#
Here, we'll query individual files and inspect their metadata.
This guide can be skipped if you are only interested in how to leverage the overall dataset.
import lamindb as ln
import lnschema_bionty as lb
import anndata as ad
💡 loaded instance: testuser1/test-scrna (lamindb 0.55.0)
ln.track()
💡 notebook imports: anndata==0.9.2 lamindb==0.55.0 lnschema_bionty==0.31.2
💡 Transform(id='agayZTonayqAz8', name='Query individual files', short_name='scrna3', version='0', type=notebook, updated_at=2023-10-04 16:40:47, created_by_id='DzTjkKse')
💡 Run(id='Gu0HW6V5keDbDdUUzBSl', run_at=2023-10-04 16:40:47, transform_id='agayZTonayqAz8', created_by_id='DzTjkKse')
hello
within hello
Access #
Query files by provenance metadata#
users = ln.User.lookup()
hello
ln.Transform.filter(created_by=users.testuser1).search("scrna")
id | __ratio__ | |
---|---|---|
name | ||
scRNA-seq | Nv48yAceNSh8z8 | 90.0 |
Append a new batch of data | ManDYgmftZ8Cz8 | 36.0 |
Query individual files | agayZTonayqAz8 | 36.0 |
transform = ln.Transform.filter(id="Nv48yAceNSh8z8").one()
ln.File.filter(transform=transform).df()
storage_id | key | suffix | accessor | description | version | size | hash | hash_type | transform_id | run_id | initial_version_id | updated_at | created_by_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||||||
D1s4vH9rGh45eJ13uQiV | 62dM2FBg | None | .h5ad | AnnData | Conde22 | None | 28049505 | WEFcMZxJNmMiUOFrcSTaig | md5 | Nv48yAceNSh8z8 | zyBkh49M3qcVFVbvsvZ5 | None | 2023-10-04 16:39:54 | DzTjkKse |
Query files based on biological metadata#
assays = lb.ExperimentalFactor.lookup()
species = lb.Species.lookup()
cell_types = lb.CellType.lookup()
hello
hello
hello
query = ln.File.filter(
experimental_factors=assays.single_cell_rna_sequencing,
species=species.human,
cell_types=cell_types.gamma_delta_t_cell,
)
query.df()
storage_id | key | suffix | accessor | description | version | size | hash | hash_type | transform_id | run_id | initial_version_id | updated_at | created_by_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||||||
D1s4vH9rGh45eJ13uQiV | 62dM2FBg | None | .h5ad | AnnData | Conde22 | None | 28049505 | WEFcMZxJNmMiUOFrcSTaig | md5 | Nv48yAceNSh8z8 | zyBkh49M3qcVFVbvsvZ5 | None | 2023-10-04 16:39:54 | DzTjkKse |
Transform #
Compare gene sets#
Get file objects:
query = ln.File.filter()
file1, file2 = query.list()
file1.describe()
hello
hello
hello
hello
hello
within hello
hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
File(id='D1s4vH9rGh45eJ13uQiV', suffix='.h5ad', accessor='AnnData', description='Conde22', size=28049505, hash='WEFcMZxJNmMiUOFrcSTaig', hash_type='md5', updated_at=2023-10-04 16:39:54)
Provenance:
🗃️ storage: Storage(id='62dM2FBg', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-10-04 16:38:48, created_by_id='DzTjkKse')
📔 transform: Transform(id='Nv48yAceNSh8z8', name='scRNA-seq', short_name='scrna', version='0', type='notebook', updated_at=2023-10-04 16:38:54, created_by_id='DzTjkKse')
👣 run: Run(id='zyBkh49M3qcVFVbvsvZ5', run_at=2023-10-04 16:38:54, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-10-04 16:38:48)
⬇️ input_of (core.Run): ['2023-10-04 16:40:02']
Features:
var: FeatureSet(id='epYAIDvjkMEktCtQmZlq', n=36503, type='number', registry='bionty.Gene', hash='dnRexHCtxtmOU81_EpoJ', updated_at=2023-10-04 16:39:42, modality_id='8B60C3n3', created_by_id='DzTjkKse')
'UBE2V1', 'ZNF407', 'APC2', 'None', 'None', 'LRP2BP-AS1', 'None', 'None', 'None', 'HSPA1B', 'None', 'None', 'PRKG2-AS1', 'SAR1A', 'PIPOX', 'None', 'RPS6KA4', 'None', 'MMRN1', 'ADIRF-AS1', ...
obs: FeatureSet(id='nOcOg6PfZo1yonP1aKOL', n=4, registry='core.Feature', hash='4xEiqlhlgIHH9Nls3xEk', updated_at=2023-10-04 16:39:47, modality_id='FnQ7xHJL', created_by_id='DzTjkKse')
🔗 donor (12, core.ULabel): 'A52', 'A29', '621B', 'A31', '582C', 'A37', 'A36', '637C', 'D496', 'A35', ...
🔗 cell_type (32, bionty.CellType): 'CD4-positive helper T cell', 'lymphocyte', 'mucosal invariant T cell', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'germinal center B cell', 'progenitor cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'alveolar macrophage', 'CD16-negative, CD56-bright natural killer cell, human', 'macrophage', ...
🔗 assay (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v1', '10x 3' v3', '10x 5' v2'
🔗 tissue (17, bionty.Tissue): 'mesenteric lymph node', 'skeletal muscle tissue', 'sigmoid colon', 'duodenum', 'lamina propria', 'lung', 'omentum', 'bone marrow', 'liver', 'spleen', ...
Labels:
🏷️ species (1, bionty.Species): 'human'
🏷️ tissues (17, bionty.Tissue): 'mesenteric lymph node', 'skeletal muscle tissue', 'sigmoid colon', 'duodenum', 'lamina propria', 'lung', 'omentum', 'bone marrow', 'liver', 'spleen', ...
🏷️ cell_types (32, bionty.CellType): 'CD4-positive helper T cell', 'lymphocyte', 'mucosal invariant T cell', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'germinal center B cell', 'progenitor cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'alveolar macrophage', 'CD16-negative, CD56-bright natural killer cell, human', 'macrophage', ...
🏷️ experimental_factors (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v1', '10x 3' v3', '10x 5' v2'
🏷️ ulabels (12, core.ULabel): 'A52', 'A29', '621B', 'A31', '582C', 'A37', 'A36', '637C', 'D496', 'A35', ...
file1.view_flow()
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
hello
hello
hello
hello
hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
hello
hello
hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
hello
hello
hello
hello
hello
hello
hello
file2.describe()
hello
hello
hello
hello
hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
File(id='UKDwd6U73Y9ZbFkPS4IC', suffix='.h5ad', accessor='AnnData', description='10x reference adata', size=660792, hash='GU-hbSJqGkENOxVKFLmvbA', hash_type='md5', updated_at=2023-10-04 16:40:30)
Provenance:
🗃️ storage: Storage(id='62dM2FBg', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-10-04 16:38:48, created_by_id='DzTjkKse')
📔 transform: Transform(id='ManDYgmftZ8Cz8', name='Append a new batch of data', short_name='scrna2', version='0', type='notebook', updated_at=2023-10-04 16:40:02, created_by_id='DzTjkKse')
👣 run: Run(id='5BdVWAv5slv0iNhEPo6T', run_at=2023-10-04 16:40:02, transform_id='ManDYgmftZ8Cz8', created_by_id='DzTjkKse')
👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-10-04 16:38:48)
Features:
var: FeatureSet(id='MdeWQREeWEr692zXBk3m', n=754, type='number', registry='bionty.Gene', hash='WMDxN7253SdzGwmznV5d', updated_at=2023-10-04 16:40:30, modality_id='8B60C3n3', created_by_id='DzTjkKse')
'CCT6A', 'PLD4', 'ADD3', 'UBALD2', 'BCAS4', 'MPP1', 'SF3B2', 'PLP2', 'ADISSP', 'LY86', 'CDK6', 'ADSL', 'ARHGAP45', 'NKG7', 'UPP1', 'DYNLRB1', 'YWHAB', 'TINF2', 'CFD', 'GATA2', ...
obs: FeatureSet(id='NgdCrmgeLfsl8aREwy1x', n=1, registry='core.Feature', hash='unk4vdl7-1zZScbA6xft', updated_at=2023-10-04 16:40:30, modality_id='FnQ7xHJL', created_by_id='DzTjkKse')
🔗 cell_type (9, bionty.CellType): 'dendritic cell', 'monocyte', 'CD16-positive, CD56-dim natural killer cell, human', 'mature T cell', 'central memory CD8-positive, alpha-beta T cell', 'Cd4-negative, CD8_alpha-negative, CD11b-positive dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD8-positive, alpha-beta memory T cell', 'B cell, CD19-positive'
external: FeatureSet(id='luodNh86nnjnElRIaj6t', n=2, registry='core.Feature', hash='Xlk8U1DLUjEsqhGGPltC', updated_at=2023-10-04 16:40:30, modality_id='FnQ7xHJL', created_by_id='DzTjkKse')
🔗 assay (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
🔗 species (1, bionty.Species): 'human'
Labels:
🏷️ species (1, bionty.Species): 'human'
🏷️ cell_types (9, bionty.CellType): 'dendritic cell', 'monocyte', 'CD16-positive, CD56-dim natural killer cell, human', 'mature T cell', 'central memory CD8-positive, alpha-beta T cell', 'Cd4-negative, CD8_alpha-negative, CD11b-positive dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD8-positive, alpha-beta memory T cell', 'B cell, CD19-positive'
🏷️ experimental_factors (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
file2.view_flow()
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
hello
hello
hello
hello
hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
hello
hello
hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
hello
hello
hello
hello
hello
hello
within hello
hello
within hello
hello
within hello
hello
within hello
hello
hello
hello
hello
hello
hello
hello
hello
Load files into memory:
file1_adata = file1.load()
file2_adata = file2.load()
Here we compute shared genes without loading files:
file1_genes = file1.features["var"]
file2_genes = file2.features["var"]
shared_genes = file1_genes & file2_genes
len(shared_genes)
hello
within hello
hello
within hello
749
shared_genes.list("symbol")[:10]
['PSMB2',
'EIF4H',
'BTG1',
'EBP',
'APMAP',
'DEK',
'ACAA2',
'MRPL48',
'PARK7',
'TRAPPC1']
Compare cell types#
file1_celltypes = file1.cell_types.all()
file2_celltypes = file2.cell_types.all()
shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names
hello
within hello
hello
within hello
['CD8-positive, alpha-beta memory T cell',
'CD16-positive, CD56-dim natural killer cell, human']
We can now subset the two datasets by shared cell types:
file1_adata_subset = file1_adata[
file1_adata.obs["cell_type"].isin(shared_celltypes_names)
]
file2_adata_subset = file2_adata[
file2_adata.obs["cell_type"].isin(shared_celltypes_names)
]
Concatenate subsetted datasets:
adata_concat = ad.concat(
[file1_adata_subset, file2_adata_subset],
label="file",
keys=[file1.description, file2.description],
)
adata_concat
AnnData object with n_obs × n_vars = 244 × 749
obs: 'cell_type', 'file'
obsm: 'X_umap'
adata_concat.obs.value_counts()
cell_type file
CD8-positive, alpha-beta memory T cell Conde22 120
CD16-positive, CD56-dim natural killer cell, human Conde22 114
CD8-positive, alpha-beta memory T cell 10x reference adata 7
CD16-positive, CD56-dim natural killer cell, human 10x reference adata 3
dtype: int64