Accessing data from the IGVF Catalog (original) (raw)

Compute overlap with plyranges

We can query genomic elements using elements() and then compute overlaps with the plyranges package as below. See the plyranges package documentation for more examples.

library(plyranges)
rng <- data.frame(seqnames="chr1", start=10e6+1, end=10.1e6) |>
  as_granges()

e <- elements(rng, limit=200L) |>
  plyranges::filter(source != "ENCODE_EpiRaction")
e
#> GRanges object with 106 ranges and 5 metadata columns:
#>         seqnames            ranges strand |         name      source_annotation
#>            <Rle>         <IRanges>  <Rle> |  <character>            <character>
#>     [1]     chr1 10001018-10001351      * | EH38E1317660 dELS: distal Enhance..
#>     [2]     chr1 10001823-10002161      * | EH38E3954141         TF: TF binding
#>     [3]     chr1 10002812-10003022      * | EH38E3954142 dELS: distal Enhance..
#>     [4]     chr1 10006462-10006688      * | EH38E3954143 CA: chromatin access..
#>     [5]     chr1 10007231-10007550      * | EH38E2785201 dELS: distal Enhance..
#>     ...      ...               ...    ... .          ...                    ...
#>   [102]     chr1 10090145-10090362      * | EH38E1317734 dELS: distal Enhance..
#>   [103]     chr1 10095440-10095595      * | EH38E1317735 dELS: distal Enhance..
#>   [104]     chr1 10096338-10096538      * | EH38E2785256 CA-CTCF: chromatin a..
#>   [105]     chr1 10098488-10098819      * | EH38E1317737 dELS: distal Enhance..
#>   [106]     chr1 10099598-10099944      * | EH38E3954159 CA: chromatin access..
#>                           type                source             source_url
#>                    <character>           <character>            <character>
#>     [1] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>     [2] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>     [3] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>     [4] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>     [5] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>     ...                    ...                   ...                    ...
#>   [102] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>   [103] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>   [104] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>   [105] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>   [106] candidate cis regula.. ENCODE_SCREEN (ccREs) https://www.encodepr..
#>   -------
#>   seqinfo: 1 sequence from hg38 genome; no seqlengths

tiles <- tile_ranges(rng, width=10e3) %>%
  plyranges::select(-partition) %>%
  plyranges::mutate(id = letters[seq_along(.)])

# count overlaps of central basepair of elements in tiles
e |>
  plyranges::anchor_center() |>
  plyranges::mutate(width = 1) |>
  plyranges::join_overlap_left(tiles) |>
  tibble::as_tibble() |>
  dplyr::count(id)
#> # A tibble: 10 × 2
#>    id        n
#>    <chr> <int>
#>  1 a         6
#>  2 b        13
#>  3 c        10
#>  4 d        15
#>  5 e        11
#>  6 f        15
#>  7 g        12
#>  8 h        12
#>  9 i         7
#> 10 j         5

Plot variants with plotgardener

Below we show a simple example of plotting variants and elements in a gene context, using the plotgardener package.

First selecting some variants around the gene GCK (reminder that we only obtain the first limit number of variants, see ?catalog_queries for more details).

# up to 200 variants:
v <- gene_variants(gene_name = "GCK", limit=200L, verbose=TRUE) |>
  dplyr::select(-c(gene, chr, source, source_url)) |>
  tidyr::unnest_wider(sequence_variant) |>
  dplyr::rename(seqnames = chr) |>
  dplyr::mutate(start = pos + 1, end = pos + 1) |>
  as_granges()

We then load plotgardener and define some default parameters.

library(plotgardener)
par <- pgParams(
  chrom = "chr7",
  chromstart = 44.1e6,
  chromend = 44.25e6,
  assembly = "hg38",
  just = c("left", "bottom")
)

Renaming some columns:

v_for_plot <- v |>
  plyranges::select(snp = rsid, p = log10pvalue, effect_size)

To match with the correct gene annotation, we could explicitly define the transcript database using plotgardener::assembly(), where we would provide a database built by running GenomicFeatures::makeTxDbFromGFF() on a GENCODE GTF file. Here we use one of the standard hg38 TxDb with UCSC-style chromosome names.

library(TxDb.Hsapiens.UCSC.hg38.knownGene)
#> Loading required package: GenomicFeatures
#> Loading required package: AnnotationDbi
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> 
#> Attaching package: 'AnnotationDbi'
#> The following object is masked from 'package:plyranges':
#> 
#>     select
library(org.Hs.eg.db)
#>

Below we build the page and populate it with annotation from the TxDb used by plotgardener and from IGVF Catalog.

pageCreate(width = 5, height = 4, showGuides = FALSE)
plotGenes(
  params = par, x = 0.5, y = 3.5, width = 4, height = 1
)
#> genes[genes1]
plotGenomeLabel(
  params = par,
  x = 0.5, y = 3.5, length = 4,
  just = c("left", "top")
)
#> genomeLabel[genomeLabel1]
mplot <- plotManhattan(
  params = par, x = 0.5, y = 2.5, width = 4, height = 2,
  v_for_plot, trans = "",
  sigVal = -log10(5e-8), sigLine = TRUE, col = "grey", lty = 2
)
#> manhattan[manhattan1]
annoYaxis(
    plot = mplot, at=0:4 * 4, axisLine = TRUE, fontsize = 8
)
#> yaxis[yaxis1]
annoXaxis(
    plot = mplot, axisLine = TRUE, label = FALSE
)
#> xaxis[xaxis1]
plotText(
    params = par,
    label = "-log10(p-value)", x = 0.2, y = 2, rot = 90,
    fontsize = 8, fontface = "bold",
    default.units = "inches"
)

#> text[text1]