GitHub - ramiromagno/hgnc: Download and import HGNC gene data into R (original) (raw)

The HGNC approves a unique and meaningful name for every known human gene, based on a group of experts. The HGNC also provides a mapping from gene symbols to gene entries in other popular databases or resources: the HGNC complete gene set.

The goal of {hgnc} is to easily download and import the latest HGNC complete gene data set into R.

This data set provides a useful mapping of HGNC symbols to gene entries in other popular databases or resources, such as the Entrez gene identifier or the UCSC gene identifier, among many others. Check the documentation of the function import_hgnc_dataset() for a description of the several fields available.

install.packages("remotes")

remotes::install_github("ramiromagno/hgnc")

To import the latest HGNC gene data set in tabular format directly into memory as a tibble do as follows:

library(hgnc)

Date of HGNC last update

last_update() #> [1] "2023-10-30 03:31:41 UTC"

Set the HGNC archive file to use for the remainder of the R-session

use_hgnc_file(file = latest_archive_url()) #> using hgnc file: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt

Import the data set in tidy tabular format

NB: Multiple-value columns are kept as list-columns

hgnc_dataset <- import_hgnc_dataset()

dplyr::glimpse(hgnc_dataset) #> Rows: 43,736 #> Columns: 55 #> $ hgnc_id "HGNC:5", "HGNC:37133", "HGNC:24086", "HGNC:7… #> $ hgnc_id2 5, 37133, 24086, 7, 27057, 23336, 41022, 4152… #> $ symbol "A1BG", "A1BG-AS1", "A1CF", "A2M", "A2M-AS1",… #> $ name "alpha-1-B glycoprotein", "A1BG antisense RNA… #> $ locus_group "protein-coding gene", "non-coding RNA", "pro… #> $ locus_type "gene with protein product", "RNA, long non-c… #> $ status "Approved", "Approved", "Approved", "Approved… #> $ location "19q13.43", "19q13.43", "10q11.23", "12p13.31… #> $ location_sortable "19q13.43", "19q13.43", "10q11.23", "12p13.31… #> $ alias_symbol NA, "FLJ23569", <"ACF", "ASP", "ACF64", "ACF… #> $ alias_name NA, NA, NA, NA, NA, NA, NA, NA, NA, <"iGb3 s… #> $ prev_symbol NA, <"NCRNA00181", "A1BGAS", "A1BG-AS">, NA,… #> $ prev_name NA, <"non-protein coding RNA 181", "A1BG ant… #> $ gene_group "Immunoglobulin like domain containing", "An… #> $ gene_group_id "594", "1987", "725", "2148", "1987", "2148"… #> $ date_approved_reserved 1989-06-30, 2009-07-20, 2007-11-23, 1986-01-… #> $ date_symbol_changed NA, 2010-11-25, NA, NA, NA, 2005-09-01, NA, … #> $ date_name_changed NA, 2012-08-15, NA, NA, 2018-03-21, 2016-03-… #> $ date_modified 2023-01-20, 2013-06-27, 2023-01-20, 2023-01-… #> $ entrez_id 1, 503538, 29974, 2, 144571, 144568, 10087410… #> $ ensembl_gene_id "ENSG00000121410", "ENSG00000268895", "ENSG00… #> $ vega_id "OTTHUMG00000183507", "OTTHUMG00000183508", "… #> $ ucsc_id "uc002qsd.5", "uc002qse.3", "uc057tgv.1", "uc… #> $ ena NA, "BC040926", "AF271790", <"BX647329", "X6… #> $ refseq_accession "NM_130786", "NR_015380", "NM_014576", "NM_0… #> $ ccds_id "CCDS12976", NA, <"CCDS7242", "CCDS7241", "C… #> $ uniprot_ids "P04217", NA, "Q9NQ94", "P01023", NA, "A8K2U… #> $ pubmed_id "2591067", NA, <"11815617", "11072063">, <"2… #> $ mgd_id "MGI:2152878", NA, "MGI:1917115", "MGI:24491… #> $ rgd_id "RGD:69417", NA, "RGD:619834", "RGD:2004", N… #> $ lsdb NA, NA, NA, "LRG_591|http://ftp.ebi.ac.uk/pub… 
#> $ cosmic NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ omim_id "138670", NA, "618199", "103950", NA, "61062… #> $ mirbase NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ homeodb NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ snornabase NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ bioparadigms_slc NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ orphanet NA, NA, NA, NA, NA, "410627", NA, NA, NA, NA,… #> $ pseudogene.org NA, NA, NA, NA, NA, NA, NA, NA, "PGOHUM000002… #> $ horde_id NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ merops "I43.950", NA, NA, "I39.001", NA, "I39.007", … #> $ imgt NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ iuphar NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ kznf_gene_catalog NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ mamit-trnadb NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ cd NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ lncrnadb NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ enzyme_id NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "2.4… #> $ intermediate_filament_db NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ rna_central_ids NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ lncipedia NA, "A1BG-AS1", NA, NA, "A2M-AS1", NA, "A2ML1… #> $ gtrnadb NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N… #> $ agr "HGNC:5", "HGNC:37133", "HGNC:24086", "HGNC:7… #> $ mane_select <"ENST00000263100.8", "NM_130786.4">, NA, <"… #> $ gencc NA, NA, NA, "HGNC:7", NA, "HGNC:23336", NA, N…

The original data set does not contain the column hgnc_id2, which is added as a convenience by {hgnc}. Although HGNC identifiers should formally contain the prefix "HGNC:", they are often found elsewhere stripped of this prefix, so the column hgnc_id2 is also provided, whose values contain only the integer part.

hgnc_dataset %>% dplyr::select(c('hgnc_id', 'hgnc_id2')) #> # A tibble: 43,736 × 2 #> hgnc_id hgnc_id2 #> #> 1 HGNC:5 5 #> 2 HGNC:37133 37133 #> 3 HGNC:24086 24086 #> 4 HGNC:7 7 #> 5 HGNC:27057 27057 #> 6 HGNC:23336 23336 #> 7 HGNC:41022 41022 #> 8 HGNC:41523 41523 #> 9 HGNC:8 8 #> 10 HGNC:30005 30005 #> # ℹ 43,726 more rows

The HGNC defines a group name (locus_group) for a set of related locus types. Here’s how you can quickly check how many gene entries there are per locus group.

hgnc_dataset %>% dplyr::count(locus_group, sort = TRUE) #> # A tibble: 4 × 2 #> locus_group n #> #> 1 protein-coding gene 19278 #> 2 pseudogene 14376 #> 3 non-coding RNA 9091 #> 4 other 991

hgnc_dataset %>% dplyr::group_by(locus_group) %>% dplyr::count(locus_type, sort = TRUE) %>% dplyr::arrange(locus_group) %>% print(n = Inf) #> # A tibble: 23 × 3 #> # Groups: locus_group [4] #> locus_group locus_type n #> #> 1 non-coding RNA RNA, long non-coding 5754 #> 2 non-coding RNA RNA, micro 1912 #> 3 non-coding RNA RNA, transfer 591 #> 4 non-coding RNA RNA, small nucleolar 568 #> 5 non-coding RNA RNA, cluster 119 #> 6 non-coding RNA RNA, ribosomal 60 #> 7 non-coding RNA RNA, small nuclear 50 #> 8 non-coding RNA RNA, misc 29 #> 9 non-coding RNA RNA, Y 4 #> 10 non-coding RNA RNA, vault 4 #> 11 other immunoglobulin gene 230 #> 12 other T cell receptor gene 206 #> 13 other readthrough 147 #> 14 other fragile site 116 #> 15 other endogenous retrovirus 109 #> 16 other complex locus constituent 69 #> 17 other unknown 68 #> 18 other region 38 #> 19 other virus integration site 8 #> 20 protein-coding gene gene with protein product 19278 #> 21 pseudogene pseudogene 14136 #> 22 pseudogene immunoglobulin pseudogene 203 #> 23 pseudogene T cell receptor pseudogene 37

By default, {hgnc} will use the latest version of the HGNC archive dataset, as returned by the function latest_archive_url(). Besides the latest archive, the HUGO Gene Nomenclature Committee (HGNC) website also provides monthly and quarterly updates. You can conveniently get the latest monthly and quarterly updates by running latest_monthly_url() or latest_quarterly_url(). Use list_archives() to list all the archives currently available for download. The column url contains the direct download link that you can pass to use_hgnc_file() and import_hgnc_dataset() to import the data into R.

list_archives() #> # A tibble: 182 × 7 #> series dataset file date size last_modified url
#> #> 1 monthly hgnc_complete_set hgnc_co… 2021-03-01 14M 2023-05-01 00:05:00 http… #> 2 monthly hgnc_complete_set hgnc_co… 2021-04-01 15M 2023-05-01 00:05:00 http… #> 3 monthly hgnc_complete_set hgnc_co… 2021-05-01 15M 2023-05-01 00:05:00 http… #> 4 monthly hgnc_complete_set hgnc_co… 2021-06-01 15M 2023-05-01 00:05:00 http… #> 5 monthly hgnc_complete_set hgnc_co… 2021-07-01 15M 2023-05-01 00:05:00 http… #> 6 monthly hgnc_complete_set hgnc_co… 2021-08-01 15M 2023-05-01 00:05:00 http… #> 7 monthly hgnc_complete_set hgnc_co… 2021-09-01 15M 2023-05-01 00:05:00 http… #> 8 monthly hgnc_complete_set hgnc_co… 2021-10-01 15M 2023-05-01 00:05:00 http… #> 9 monthly hgnc_complete_set hgnc_co… 2021-11-01 15M 2023-05-01 00:05:00 http… #> 10 monthly hgnc_complete_set hgnc_co… 2021-12-01 15M 2023-05-01 00:05:00 http… #> # ℹ 172 more rows

use_hgnc_file(file = latest_monthly_url()) #> using hgnc file: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-10-01.txt

If you prefer to download the data set as a file to disk first, you can use download_archive(). Then, you can use import_hgnc_dataset() to import the downloaded file into R.

Two convenience functions are provided to convert between different gene identifiers in the HGNC dataset: hgnc_join() takes as input a data frame and adds an additional column from the HGNC dataset, and hgnc_convert() converts a vector from one identifier to another.

dplyr::tibble(hgnc_id = c("HGNC:44948", "HGNC:43240", "HGNC:23357", "HGNC:1855", "HGNC:39400")) %>%

add in gene symbol

hgnc_join(by = 'hgnc_id', column = 'symbol') %>%

add in entrez_id

hgnc_join(by = 'hgnc_id', column = 'entrez_id') #> # A tibble: 5 × 3 #> hgnc_id symbol entrez_id #> #> 1 HGNC:44948 GOLGA2P6 729668 #> 2 HGNC:43240 RNA5SP340 100873603 #> 3 HGNC:23357 MCTS1 28985 #> 4 HGNC:1855 CENPCP1 1061 #> 5 HGNC:39400 ASNSP4 100419423

convert a set of hgnc_ids to symbols

hgnc_ids <- c("HGNC:44948", "HGNC:43240", "HGNC:23357", "HGNC:1855", "HGNC:39400") hgnc_convert(hgnc_ids, from = 'hgnc_id', to = 'symbol') #> [1] "GOLGA2P6" "RNA5SP340" "MCTS1" "CENPCP1" "ASNSP4"

convert a set of entrez_ids to ensembl_gene_ids

entrez_ids <- c(79933, 109623458, 158471, 54987, 81631) hgnc_convert(entrez_ids, from = 'entrez_id', to = 'ensembl_gene_id') #> [1] "ENSG00000166317" NA "ENSG00000106772" "ENSG00000162384" #> [5] "ENSG00000140941"

By default, {hgnc} will use memory-based caching through the package {memoise}. This will ensure that time-consuming downloads are not rerun unnecessarily during a single R session. Persistent disk-based caching can also be enabled using the function use_cache_dir().

use_cache_dir(cache_dir = './hgnc_cache') #> using hgnc cache dir: ./hgnc_cache

You could go to www.genenames.org and download the files yourself. So why the need for this R package?

{hgnc} really is just a convenience package. The main advantage is that the function import_hgnc_dataset() reads in the data in tabular format with all the columns with the appropriate type (so you don’t have to specify it yourself). As an extra step, those variables that contain multiple values are encoded as list-columns.

Remember that list-columns can be expanded with tidyr::unnest(). E.g., alias_symbol is a list-column containing multiple alternative aliases to the standard symbol:

hgnc_dataset %>% dplyr::filter(symbol == 'TP53') %>% dplyr::select(c('symbol', 'alias_symbol')) #> # A tibble: 1 × 2 #> symbol alias_symbol #>
#> 1 TP53 <chr [2]>

hgnc_dataset %>% dplyr::filter(symbol == 'TP53') %>% dplyr::select(c('symbol', 'alias_symbol')) %>% tidyr::unnest(cols = 'alias_symbol') #> # A tibble: 2 × 2 #> symbol alias_symbol #>
#> 1 TP53 p53
#> 2 TP53 LFS1

In addition, we provide the function filter_by_keyword(), which allows filtering the data set based on a keyword or regular expression. By default, this function will look into all columns that contain gene symbols or names (symbol, name, alias_symbol, alias_name, prev_symbol and prev_name). It works automatically with list-columns too.

hgnc_dataset %>% filter_by_keyword('TP53') %>% dplyr::select(1:4) #> # A tibble: 66 × 4 #> hgnc_id hgnc_id2 symbol name
#>
#> 1 HGNC:49685 49685 ABHD15-AS1 ABHD15 antisense RNA 1
#> 2 HGNC:20679 20679 ANO9 anoctamin 9
#> 3 HGNC:40093 40093 BCAR3-AS1 BCAR3 antisense RNA 1
#> 4 HGNC:13276 13276 EI24 EI24 autophagy associated transmembrane prot… #> 5 HGNC:3345 3345 ENC1 ectodermal-neural cortex 1
#> 6 HGNC:27919 27919 ERVMER61-1 endogenous retrovirus group MER61 member 1
#> 7 HGNC:56226 56226 FAM169A-AS1 FAM169A antisense RNA 1
#> 8 HGNC:4136 4136 GAMT guanidinoacetate N-methyltransferase
#> 9 HGNC:54868 54868 KLRK1-AS1 KLRK1 antisense RNA 1
#> 10 HGNC:6568 6568 LGALS7 galectin 7
#> # ℹ 56 more rows

hgnc_dataset %>% filter_by_keyword('TP53', cols = 'symbol') %>% dplyr::select(1:4) #> # A tibble: 23 × 4 #> hgnc_id hgnc_id2 symbol name
#>
#> 1 HGNC:11998 11998 TP53 tumor protein p53
#> 2 HGNC:29984 29984 TP53AIP1 tumor protein p53 regulated apoptosis inducing… #> 3 HGNC:11999 11999 TP53BP1 tumor protein p53 binding protein 1
#> 4 HGNC:12000 12000 TP53BP2 tumor protein p53 binding protein 2
#> 5 HGNC:16328 16328 TP53BP2P1 tumor protein p53 binding protein 2 pseudogene… #> 6 HGNC:43652 43652 TP53COR1 tumor protein p53 pathway corepressor 1
#> 7 HGNC:19373 19373 TP53I3 tumor protein p53 inducible protein 3
#> 8 HGNC:16842 16842 TP53I11 tumor protein p53 inducible protein 11
#> 9 HGNC:25102 25102 TP53I13 tumor protein p53 inducible protein 13
#> 10 HGNC:18022 18022 TP53INP1 tumor protein p53 inducible nuclear protein 1
#> # ℹ 13 more rows

Search for the whole word "TP53" exactly by taking advantage of regular expressions:

hgnc_dataset %>% filter_by_keyword('^TP53$', cols = 'symbol') %>% dplyr::select(1:4) #> # A tibble: 1 × 4 #> hgnc_id hgnc_id2 symbol name
#>
#> 1 HGNC:11998 11998 TP53 tumor protein p53

Please include the month and year you retrieved the data cited.