Query individual files#

Here, we’ll query individual files and inspect their metadata.

This guide can be skipped if you are only interested in how to leverage the overall dataset.

import lamindb as ln
import lnschema_bionty as lb
import anndata as ad

💡 loaded instance: testuser1/test-scrna (lamindb 0.55.0)

ln.track()

💡 notebook imports: anndata==0.9.2 lamindb==0.55.0 lnschema_bionty==0.31.2

💡 Transform(id='agayZTonayqAz8', name='Query individual files', short_name='scrna3', version='0', type=notebook, updated_at=2023-10-04 16:40:47, created_by_id='DzTjkKse')

💡 Run(id='Gu0HW6V5keDbDdUUzBSl', run_at=2023-10-04 16:40:47, transform_id='agayZTonayqAz8', created_by_id='DzTjkKse')

hello

within hello

Access#

Query files by provenance metadata#

users = ln.User.lookup()

hello

ln.Transform.filter(created_by=users.testuser1).search("scrna")

	id	__ratio__
name
scRNA-seq	Nv48yAceNSh8z8	90.0
Append a new batch of data	ManDYgmftZ8Cz8	36.0
Query individual files	agayZTonayqAz8	36.0

transform = ln.Transform.filter(id="Nv48yAceNSh8z8").one()

ln.File.filter(transform=transform).df()

	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	transform_id	run_id	initial_version_id	updated_at	created_by_id
id
D1s4vH9rGh45eJ13uQiV	62dM2FBg	None	.h5ad	AnnData	Conde22	None	28049505	WEFcMZxJNmMiUOFrcSTaig	md5	Nv48yAceNSh8z8	zyBkh49M3qcVFVbvsvZ5	None	2023-10-04 16:39:54	DzTjkKse

Query files based on biological metadata#

assays = lb.ExperimentalFactor.lookup()
species = lb.Species.lookup()
cell_types = lb.CellType.lookup()

hello

hello

hello

query = ln.File.filter(
    experimental_factors=assays.single_cell_rna_sequencing,
    species=species.human,
    cell_types=cell_types.gamma_delta_t_cell,
)

query.df()

	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	transform_id	run_id	initial_version_id	updated_at	created_by_id
id
D1s4vH9rGh45eJ13uQiV	62dM2FBg	None	.h5ad	AnnData	Conde22	None	28049505	WEFcMZxJNmMiUOFrcSTaig	md5	Nv48yAceNSh8z8	zyBkh49M3qcVFVbvsvZ5	None	2023-10-04 16:39:54	DzTjkKse

Transform#

Compare gene sets#

Get file objects:

query = ln.File.filter()

file1, file2 = query.list()

file1.describe()

hello

hello

hello

hello

hello

within hello

hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

File(id='D1s4vH9rGh45eJ13uQiV', suffix='.h5ad', accessor='AnnData', description='Conde22', size=28049505, hash='WEFcMZxJNmMiUOFrcSTaig', hash_type='md5', updated_at=2023-10-04 16:39:54)

Provenance:
  🗃️ storage: Storage(id='62dM2FBg', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-10-04 16:38:48, created_by_id='DzTjkKse')
  📔 transform: Transform(id='Nv48yAceNSh8z8', name='scRNA-seq', short_name='scrna', version='0', type='notebook', updated_at=2023-10-04 16:38:54, created_by_id='DzTjkKse')
  👣 run: Run(id='zyBkh49M3qcVFVbvsvZ5', run_at=2023-10-04 16:38:54, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
  👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-10-04 16:38:48)
  ⬇️ input_of (core.Run): ['2023-10-04 16:40:02']
Features:
  var: FeatureSet(id='epYAIDvjkMEktCtQmZlq', n=36503, type='number', registry='bionty.Gene', hash='dnRexHCtxtmOU81_EpoJ', updated_at=2023-10-04 16:39:42, modality_id='8B60C3n3', created_by_id='DzTjkKse')
    'UBE2V1', 'ZNF407', 'APC2', 'None', 'None', 'LRP2BP-AS1', 'None', 'None', 'None', 'HSPA1B', 'None', 'None', 'PRKG2-AS1', 'SAR1A', 'PIPOX', 'None', 'RPS6KA4', 'None', 'MMRN1', 'ADIRF-AS1', ...
  obs: FeatureSet(id='nOcOg6PfZo1yonP1aKOL', n=4, registry='core.Feature', hash='4xEiqlhlgIHH9Nls3xEk', updated_at=2023-10-04 16:39:47, modality_id='FnQ7xHJL', created_by_id='DzTjkKse')
    🔗 donor (12, core.ULabel): 'A52', 'A29', '621B', 'A31', '582C', 'A37', 'A36', '637C', 'D496', 'A35', ...
    🔗 cell_type (32, bionty.CellType): 'CD4-positive helper T cell', 'lymphocyte', 'mucosal invariant T cell', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'germinal center B cell', 'progenitor cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'alveolar macrophage', 'CD16-negative, CD56-bright natural killer cell, human', 'macrophage', ...
    🔗 assay (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v1', '10x 3' v3', '10x 5' v2'
    🔗 tissue (17, bionty.Tissue): 'mesenteric lymph node', 'skeletal muscle tissue', 'sigmoid colon', 'duodenum', 'lamina propria', 'lung', 'omentum', 'bone marrow', 'liver', 'spleen', ...
Labels:
  🏷️ species (1, bionty.Species): 'human'
  🏷️ tissues (17, bionty.Tissue): 'mesenteric lymph node', 'skeletal muscle tissue', 'sigmoid colon', 'duodenum', 'lamina propria', 'lung', 'omentum', 'bone marrow', 'liver', 'spleen', ...
  🏷️ cell_types (32, bionty.CellType): 'CD4-positive helper T cell', 'lymphocyte', 'mucosal invariant T cell', 'naive thymus-derived CD4-positive, alpha-beta T cell', 'germinal center B cell', 'progenitor cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'alveolar macrophage', 'CD16-negative, CD56-bright natural killer cell, human', 'macrophage', ...
  🏷️ experimental_factors (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v1', '10x 3' v3', '10x 5' v2'
  🏷️ ulabels (12, core.ULabel): 'A52', 'A29', '621B', 'A31', '582C', 'A37', 'A36', '637C', 'D496', 'A35', ...

file1.view_flow()

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

hello

hello

hello

hello

hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

hello

hello

hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

hello

hello

hello

hello

hello

hello

hello

https://d33wubrfki0l68.cloudfront.net/b9733a90c0b081b12f370552705a4865b07b16c6/ba034/_images/33517487138dc15b4400bb465d08ac2f7a3b23547ceb855c505bc03b9528fe9a.svg

file2.describe()

hello

hello

hello

hello

hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

File(id='UKDwd6U73Y9ZbFkPS4IC', suffix='.h5ad', accessor='AnnData', description='10x reference adata', size=660792, hash='GU-hbSJqGkENOxVKFLmvbA', hash_type='md5', updated_at=2023-10-04 16:40:30)

Provenance:
  🗃️ storage: Storage(id='62dM2FBg', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-10-04 16:38:48, created_by_id='DzTjkKse')
  📔 transform: Transform(id='ManDYgmftZ8Cz8', name='Append a new batch of data', short_name='scrna2', version='0', type='notebook', updated_at=2023-10-04 16:40:02, created_by_id='DzTjkKse')
  👣 run: Run(id='5BdVWAv5slv0iNhEPo6T', run_at=2023-10-04 16:40:02, transform_id='ManDYgmftZ8Cz8', created_by_id='DzTjkKse')
  👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-10-04 16:38:48)
Features:
  var: FeatureSet(id='MdeWQREeWEr692zXBk3m', n=754, type='number', registry='bionty.Gene', hash='WMDxN7253SdzGwmznV5d', updated_at=2023-10-04 16:40:30, modality_id='8B60C3n3', created_by_id='DzTjkKse')
    'CCT6A', 'PLD4', 'ADD3', 'UBALD2', 'BCAS4', 'MPP1', 'SF3B2', 'PLP2', 'ADISSP', 'LY86', 'CDK6', 'ADSL', 'ARHGAP45', 'NKG7', 'UPP1', 'DYNLRB1', 'YWHAB', 'TINF2', 'CFD', 'GATA2', ...
  obs: FeatureSet(id='NgdCrmgeLfsl8aREwy1x', n=1, registry='core.Feature', hash='unk4vdl7-1zZScbA6xft', updated_at=2023-10-04 16:40:30, modality_id='FnQ7xHJL', created_by_id='DzTjkKse')
    🔗 cell_type (9, bionty.CellType): 'dendritic cell', 'monocyte', 'CD16-positive, CD56-dim natural killer cell, human', 'mature T cell', 'central memory CD8-positive, alpha-beta T cell', 'Cd4-negative, CD8_alpha-negative, CD11b-positive dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD8-positive, alpha-beta memory T cell', 'B cell, CD19-positive'
  external: FeatureSet(id='luodNh86nnjnElRIaj6t', n=2, registry='core.Feature', hash='Xlk8U1DLUjEsqhGGPltC', updated_at=2023-10-04 16:40:30, modality_id='FnQ7xHJL', created_by_id='DzTjkKse')
    🔗 assay (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
    🔗 species (1, bionty.Species): 'human'
Labels:
  🏷️ species (1, bionty.Species): 'human'
  🏷️ cell_types (9, bionty.CellType): 'dendritic cell', 'monocyte', 'CD16-positive, CD56-dim natural killer cell, human', 'mature T cell', 'central memory CD8-positive, alpha-beta T cell', 'Cd4-negative, CD8_alpha-negative, CD11b-positive dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD8-positive, alpha-beta memory T cell', 'B cell, CD19-positive'
  🏷️ experimental_factors (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'

file2.view_flow()

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

hello

hello

hello

hello

hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

hello

hello

hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

hello

hello

hello

hello

hello

hello

within hello

hello

within hello

hello

within hello

hello

within hello

hello

hello

hello

hello

hello

hello

hello

hello

https://d33wubrfki0l68.cloudfront.net/9665bae93efc76eb30a0d3b1e96702b2ac407436/f0f50/_images/81495e3993aaab45a0d15065ba6e5394020264522e4fcb681e49c20ecbab0c41.svg

Load both files into memory as AnnData objects (they are used further below to subset and concatenate the data):

file1_adata = file1.load()
file2_adata = file2.load()

The shared genes can be computed directly from the files' `var` feature sets, so this step does not depend on the in-memory data:

file1_genes = file1.features["var"]
file2_genes = file2.features["var"]

shared_genes = file1_genes & file2_genes
len(shared_genes)

hello

within hello

hello

within hello

shared_genes.list("symbol")[:10]

['PSMB2',
 'EIF4H',
 'BTG1',
 'EBP',
 'APMAP',
 'DEK',
 'ACAA2',
 'MRPL48',
 'PARK7',
 'TRAPPC1']

Compare cell types#

file1_celltypes = file1.cell_types.all()
file2_celltypes = file2.cell_types.all()

shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names

hello

within hello

hello

within hello

['CD8-positive, alpha-beta memory T cell',
 'CD16-positive, CD56-dim natural killer cell, human']

We can now subset the two datasets by shared cell types:

file1_adata_subset = file1_adata[
    file1_adata.obs["cell_type"].isin(shared_celltypes_names)
]

file2_adata_subset = file2_adata[
    file2_adata.obs["cell_type"].isin(shared_celltypes_names)
]

Concatenate the two subsetted datasets, labeling each observation with the description of the file it came from:

adata_concat = ad.concat(
    [file1_adata_subset, file2_adata_subset],
    label="file",
    keys=[file1.description, file2.description],
)
adata_concat

AnnData object with n_obs × n_vars = 244 × 749
    obs: 'cell_type', 'file'
    obsm: 'X_umap'

adata_concat.obs.value_counts()

cell_type                                           file               
CD8-positive, alpha-beta memory T cell              Conde22                120
CD16-positive, CD56-dim natural killer cell, human  Conde22                114
CD8-positive, alpha-beta memory T cell              10x reference adata      7
CD16-positive, CD56-dim natural killer cell, human  10x reference adata      3
dtype: int64