CELLxGENE Census: Real-world Atlas Query

Overview

CELLxGENE Census hosts 60M+ cells across 900+ datasets as a single queryable TileDB-SOMA experiment on public S3. This article shows a practical end-to-end workflow:

  1. Query CD4 T cells from Census via cellxgene.census::get_seurat()
  2. Run a standard Seurat analysis pipeline on the result
  3. Export to h5ad with scConvert::writeH5AD() for Python colleagues

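The Census query streams the matching cells directly from S3, so the full Census is never downloaded. The result is still large (about 964,000 cells in the run shown below), so use a machine with ample memory.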

Prerequisites

# The TileDB-SOMA stack and the Census client are installed from their
# r-universe repositories; CRAN is listed as a second repository, presumably
# so that CRAN-hosted dependencies resolve.
install.packages("tiledb",
  repos = c("https://tiledb-inc.r-universe.dev", "https://cloud.r-project.org"))
install.packages("tiledbsoma",
  repos = c("https://tiledb-inc.r-universe.dev", "https://cloud.r-project.org"))
install.packages("cellxgene.census",
  repos = c("https://chanzuckerberg.r-universe.dev", "https://cloud.r-project.org"))

# Steps 1-3 call Seurat and ggplot2 functions; both are available from CRAN.
install.packages(c("Seurat", "ggplot2"), repos = "https://cloud.r-project.org")

# Step 4 additionally needs scConvert (writeH5AD() / readH5AD()); install it
# by following that package's own installation instructions.

Step 1 — Query Census

# Attach every package this article calls into. The order mirrors the attach
# order recorded in the Session Info section, so function masking is unchanged.
library(scConvert)        # writeH5AD(), readH5AD()
library(Seurat)           # object construction, analysis pipeline, plots
library(ggplot2)          # ggtitle(), ggplot() layers
library(cellxgene.census) # open_soma(), get_seurat()

census <- open_soma(census_version = "stable")

# Run the query inside tryCatch() so the Census handle is closed even when the
# query fails part-way through.
cd4_raw <- tryCatch(
  get_seurat(
    census,
    organism         = "Homo sapiens",
    obs_value_filter = paste(
      "cell_type == 'CD4-positive, alpha-beta T cell'",
      "& tissue_general == 'blood'",
      "& is_primary_data == True"
    ),
    obs_column_names = c(
      "cell_type", "donor_id", "dataset_id",
      "sex", "disease", "tissue_general"
    ),
    var_column_names = c("feature_id", "feature_name")
  ),
  finally = census$close()
)

# Census returns Ensembl IDs as rownames; rename to gene symbols for usability.
# `[["feature_name"]]` avoids `$` partial matching, as.character() guards
# against a factor-encoded column, and make.unique() de-duplicates symbols so
# they are valid rownames.
counts <- GetAssayData(cd4_raw, assay = "RNA", layer = "counts")
gene_symbols <- as.character(cd4_raw[["RNA"]][[]][["feature_name"]])
rownames(counts) <- make.unique(gene_symbols)

# Rebuild the object from the renamed counts, carrying over the cell metadata,
# then drop the intermediates to free memory.
cd4 <- CreateSeuratObject(counts = counts, meta.data = cd4_raw[[]])
rm(cd4_raw, counts, gene_symbols)

# Report the size of the query result: cells, genes, donors and source datasets.
n_cells <- ncol(cd4)
n_genes <- nrow(cd4)
cat("Loaded:", n_cells, "cells x", n_genes, "genes\n")
#> Loaded: 964175 cells x 61497 genes
n_donors <- length(unique(cd4$donor_id))
cat("Donors:", n_donors, "\n")
#> Donors: 1715
n_datasets <- length(unique(cd4$dataset_id))
cat("Datasets:", n_datasets, "\n")
#> Datasets: 35
# Auto-print the object for its built-in summary.
cd4
#> An object of class Seurat 
#> 61497 features across 964175 samples within 1 assay 
#> Active assay: RNA (61497 features, 0 variable features)
#>  2 layers present: counts, data

Step 2 — Seurat pipeline

# Standard Seurat workflow as one pipe chain: normalise, pick 2,000 variable
# features, scale, PCA (30 components), then UMAP, neighbour graph and
# clustering on the first 20 components.
cd4 <- cd4 |>
  NormalizeData(verbose = FALSE) |>
  FindVariableFeatures(nfeatures = 2000L, verbose = FALSE) |>
  ScaleData(verbose = FALSE) |>
  RunPCA(npcs = 30L, verbose = FALSE) |>
  RunUMAP(dims = 1:20, verbose = FALSE) |>
  FindNeighbors(dims = 1:20, verbose = FALSE) |>
  FindClusters(resolution = 0.4, verbose = FALSE)

Step 3 — Visualisation

UMAP coloured by cluster

# UMAP with cluster labels drawn on the plot instead of a legend; the title
# reports how many cells are shown.
umap_title <- sprintf(
  "CD4 T cells from CELLxGENE Census (%s cells)",
  ncol(cd4)
)
DimPlot(cd4, reduction = "umap", label = TRUE, pt.size = 0.3) +
  NoLegend() +
  ggtitle(umap_title)

Key marker expression

The panels below show expression of four markers: CD4, IL7R (memory), CCR7 (naive / central-memory), and FOXP3 (Treg).

# One UMAP panel per marker gene, arranged in a 2-column grid. `order = TRUE`
# is passed so expressing cells are not hidden behind non-expressing ones.
marker_genes <- c("CD4", "IL7R", "CCR7", "FOXP3")
FeaturePlot(cd4, features = marker_genes, ncol = 2L, pt.size = 0.2, order = TRUE)

Disease composition

# Count cells per disease label, most frequent first.
disease_counts <- table(cd4$disease) |>
  sort(decreasing = TRUE)
df <- data.frame(disease = names(disease_counts), n = as.integer(disease_counts))

# Horizontal bar chart; reorder() sorts the bars by cell count.
ggplot(data = df, mapping = aes(x = reorder(disease, n), y = n)) +
  geom_col(fill = "#4E79A7") +
  labs(x = NULL, y = "Cells", title = "Disease annotation (Census metadata)") +
  theme_classic(base_size = 11) +
  coord_flip()

Step 4 — Export to h5ad

# Write the analysed object to an .h5ad file in this session's temp directory.
h5ad_path <- file.path(tempdir(), "census_cd4.h5ad")
writeH5AD(cd4, h5ad_path)
cat("Written:", h5ad_path, "\n")
#> Written: /var/folders/9l/bl67cpdj3rzgkx2pfk0flmhc0000gn/T//RtmpuOkTE9/census_cd4.h5ad
# Report the on-disk size in megabytes (10^6 bytes), to one decimal place.
h5ad_mb <- round(file.size(h5ad_path) / 1e6, 1L)
cat("File size:", h5ad_mb, "MB\n")
#> File size: 5423.8 MB

Verify round-trip

# Read the file back and confirm that the dimensions survived the round trip.
cd4_rt <- readH5AD(h5ad_path)
stopifnot(
  ncol(cd4_rt) == ncol(cd4),
  nrow(cd4_rt) == nrow(cd4)
)
cat("Round-trip OK:", ncol(cd4_rt), "cells x", nrow(cd4_rt), "genes\n")
#> Round-trip OK: 964175 cells x 61497 genes

# Compare cluster labels as character vectors, cell by cell. `==` on two
# factors errors when their level sets differ, and all() returns NA if any
# label is NA; identical() on character vectors always yields TRUE or FALSE.
clusters_match <- identical(
  as.character(cd4_rt$seurat_clusters),
  as.character(cd4$seurat_clusters)
)
cat("Clusters preserved:", clusters_match, "\n")
#> Clusters preserved: TRUE

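Key takeaways

  - get_seurat() pulls a filtered slice of Census (here, CD4 T cells from blood) straight from S3; the full atlas is never downloaded.
  - Once the Ensembl-ID rownames are replaced with gene symbols, the result runs through the standard Seurat pipeline unchanged.
  - writeH5AD() exports the analysed object for Python users, and readH5AD() reads it back with the same dimensions and cluster labels.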

Session Info

# Record the R version, platform and package versions used for this run.
sessionInfo()
#> R version 4.6.0 (2026-04-24)
#> Platform: aarch64-apple-darwin23
#> Running under: macOS Tahoe 26.3
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.6/Resources/lib/libRblas.0.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.6/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> time zone: America/Indiana/Indianapolis
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] future_1.70.0           RcppSpdlog_0.0.28       cellxgene.census_1.16.1
#> [4] ggplot2_4.0.3           Seurat_5.5.0            SeuratObject_5.4.0     
#> [7] sp_2.2-1                scConvert_0.2.0        
#> 
#> loaded via a namespace (and not attached):
#>   [1] RColorBrewer_1.1-3     jsonlite_2.0.0         magrittr_2.0.5        
#>   [4] ggbeeswarm_0.7.3       spatstat.utils_3.2-2   farver_2.1.2          
#>   [7] rmarkdown_2.31         vctrs_0.7.3            ROCR_1.0-12           
#>  [10] Cairo_1.7-0            spatstat.explore_3.8-0 base64enc_0.1-6       
#>  [13] htmltools_0.5.9        curl_7.1.0             sass_0.4.10           
#>  [16] sctransform_0.4.3      parallelly_1.47.0      KernSmooth_2.23-26    
#>  [19] bslib_0.10.0           htmlwidgets_1.6.4      ica_1.0-3             
#>  [22] plyr_1.8.9             plotly_4.12.0          zoo_1.8-15            
#>  [25] cachem_1.1.0           igraph_2.3.1           mime_0.13             
#>  [28] lifecycle_1.0.5        pkgconfig_2.0.3        Matrix_1.7-5          
#>  [31] R6_2.6.1               fastmap_1.2.0          MatrixGenerics_1.24.0 
#>  [34] fitdistrplus_1.2-6     shiny_1.13.0           digest_0.6.39         
#>  [37] tiledb_0.34.0          S4Vectors_0.50.0       patchwork_1.3.2       
#>  [40] tensor_1.5.1           RSpectra_0.16-2        irlba_2.3.7           
#>  [43] GenomicRanges_1.64.0   aws.signature_0.6.0    labeling_0.4.3        
#>  [46] progressr_0.19.0       spatstat.sparse_3.1-0  httr_1.4.8            
#>  [49] polyclip_1.10-7        abind_1.4-8            compiler_4.6.0        
#>  [52] bit64_4.8.0            withr_3.0.2            S7_0.2.2              
#>  [55] fastDummies_1.7.6      MASS_7.3-65            tiledbsoma_2.3.0      
#>  [58] tools_4.6.0            vipor_0.4.7            lmtest_0.9-40         
#>  [61] otel_0.2.0             beeswarm_0.4.0         httpuv_1.6.17         
#>  [64] future.apply_1.20.2    goftest_1.2-3          glue_1.8.1            
#>  [67] nlme_3.1-169           promises_1.5.0         grid_4.6.0            
#>  [70] Rtsne_0.17             cluster_2.1.8.2        reshape2_1.4.5        
#>  [73] generics_0.1.4         hdf5r_1.3.12           gtable_0.3.6          
#>  [76] spatstat.data_3.1-9    tidyr_1.3.2            data.table_1.18.4     
#>  [79] xml2_1.5.2             BiocGenerics_0.58.0    BPCells_0.3.1         
#>  [82] spatstat.geom_3.7-3    RcppAnnoy_0.0.23       ggrepel_0.9.8         
#>  [85] RANN_2.6.2             pillar_1.11.1          stringr_1.6.0         
#>  [88] nanoarrow_0.8.0        spam_2.11-3            RcppHNSW_0.6.0        
#>  [91] later_1.4.8            splines_4.6.0          dplyr_1.2.1           
#>  [94] lattice_0.22-9         survival_3.8-6         bit_4.6.0             
#>  [97] deldir_2.0-4           tidyselect_1.2.1       miniUI_0.1.2          
#> [100] pbapply_1.7-4          knitr_1.51             gridExtra_2.3         
#> [103] Seqinfo_1.2.0          IRanges_2.46.0         RcppCCTZ_0.2.14       
#> [106] scattermore_1.2        stats4_4.6.0           xfun_0.57             
#> [109] matrixStats_1.5.0      stringi_1.8.7          lazyeval_0.2.3        
#> [112] yaml_2.3.12            evaluate_1.0.5         codetools_0.2-20      
#> [115] tibble_3.3.1           cli_3.6.6              uwot_0.2.4            
#> [118] arrow_24.0.0           xtable_1.8-8           reticulate_1.46.0     
#> [121] jquerylib_0.1.4        dichromat_2.0-0.1      Rcpp_1.1.1-1.1        
#> [124] globals_0.19.1         spatstat.random_3.4-5  png_0.1-9             
#> [127] ggrastr_1.0.2          spatstat.univar_3.1-7  parallel_4.6.0        
#> [130] assertthat_0.2.1       dotCall64_1.2          aws.s3_0.3.22         
#> [133] spdl_0.0.5             listenv_0.10.1         viridisLite_0.4.3     
#> [136] scales_1.4.0           ggridges_0.5.7         purrr_1.2.2           
#> [139] crayon_1.5.3           rlang_1.2.0            cowplot_1.2.0         
#> [142] nanotime_0.3.14