Dec 6, 2016

Explore more posts
by Tags | by Date | View All
Keep up to date
  • Mapping Snps And Peaks To Genes In R


    Mapping SNPs and peaks to genes in R

    We are often interested in mapping mutations or SNPs to genes, or peaks to genes, or genes to regions of copy number alteration, etc. The general computational problem is quite similar for all these cases: we have two sets of genomic regions that we seek to overlap. This can be accomplished very quickly in R using GenomicRanges. To learn more about GenomicRanges, consult Bioconductor.

    In this particular tutorial, I want to map a set of SNPs to genes.

    # Sample snps
    snps <- c("1:11873", "1:69100", "1:752761")
    
    # Sample genes from GTF
    gtfFile <- 'Homo_sapiens.GRCh37.75.gtf'
    gtf <- read.table(gtfFile, header=F, stringsAsFactors=F, sep='\t', nrows=1000) # limit number of rows for testing
    gtf.gene <- gtf[gtf[,3]=="gene", c(1,4,5)]
    gene.names <- unlist(lapply(gtf[gtf[,3]=="gene", 9], function(x) {
        y <- strsplit(x, ';')[[1]][2]
        gsub(' gene_name ', '', y)
    }))
    rownames(gtf.gene) <- gene.names
    head(gtf.gene)
    
    ##            V1    V4    V5
    ## DDX11L1     1 11869 14412
    ## WASH7P      1 14363 29806
    ## MIR1302-10  1 29554 31109
    ## FAM138A     1 34554 36081
    ## OR4G4P      1 52473 54936
    ## OR4G11P     1 62948 63887
    

    Let’s define a few helper functions to help out.

    # Define a few helper functions
    
    #' Convert from string to range
    #' 
    #' @param pos A vector of strings ex. chr1 2938302 2938329
    #' @param delim Delimiter for string splitting
    #' @param region Boolean of whether region or just one position
    #'
    #' @returns Dataframe of ranges
    #' 
    string2range <- function(pos, delim=' ', region=TRUE) {
        posp <- as.data.frame(do.call(rbind, strsplit(pos, delim)))
        posp[,1] <- posp[,1]
    	posp[,2] <- as.numeric(as.character(posp[,2]))
    	if(region) {
            posp[,3] <- as.numeric(as.character(posp[,3]))
    	} else {
    	    posp[,3] <- posp[,2]
    	}
        return(posp)
    }
    
    #' Convert from ranges to GRanges
    #' 
    #' @param df Dataframe with columns as sequence name, start, and end
    #' 
    #' @returns GRanges version 
    #' 
    range2GRanges <- function(df) {
        require(GenomicRanges)
        require(IRanges)
    	gr <- GenomicRanges::GRanges(
            seqnames = df[,1],
            ranges=IRanges(start = df[,2], end = df[,3])
            )
        return(gr)
    }
    

    Now let’s convert our SNPs to GRanges.

    # convert SNPs to GRanges
    snps.ranges <- string2range(snps, delim=":", region=FALSE)
    head(snps.ranges)
    
    ##   V1     V2     V3
    ## 1  1  11873  11873
    ## 2  1  69100  69100
    ## 3  1 752761 752761
    
    snps.granges <- range2GRanges(snps.ranges)
    names(snps.granges) <- snps
    head(snps.granges)
    
    ## GRanges object with 3 ranges and 0 metadata columns:
    ##            seqnames           ranges strand
    ##               <Rle>        <IRanges>  <Rle>
    ##    1:11873        1 [ 11873,  11873]      *
    ##    1:69100        1 [ 69100,  69100]      *
    ##   1:752761        1 [752761, 752761]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    
    # convert genes to GRanges
    gtf.granges <- range2GRanges(gtf.gene)
    names(gtf.granges) <- gene.names
    head(gtf.granges)
    
    ## GRanges object with 6 ranges and 0 metadata columns:
    ##              seqnames         ranges strand
    ##                 <Rle>      <IRanges>  <Rle>
    ##      DDX11L1        1 [11869, 14412]      *
    ##       WASH7P        1 [14363, 29806]      *
    ##   MIR1302-10        1 [29554, 31109]      *
    ##      FAM138A        1 [34554, 36081]      *
    ##       OR4G4P        1 [52473, 54936]      *
    ##      OR4G11P        1 [62948, 63887]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    

    Now that we have our two GRanges objects, we can easily overlap them using GenomicRanges::findOverlaps.

    r1 <- snps.granges
    r2 <- gtf.granges
    overlap <- GenomicRanges::findOverlaps(r1, r2)
    # make vector of SNPs to genes
    hits <- names(r2)[slot(overlap, "subjectHits")]
    names(hits) <- names(r1)[slot(overlap, "queryHits")]
    hits
    
    ##          1:11873          1:69100         1:752761         1:752761 
    ##        "DDX11L1"          "OR4F5" "RP11-206L10.10"         "FAM87B"
    

    We can double check manually that indeed our SNPs fall within these identified genes.

    r1[names(hits),]
    
    ## GRanges object with 4 ranges and 0 metadata columns:
    ##            seqnames           ranges strand
    ##               <Rle>        <IRanges>  <Rle>
    ##    1:11873        1 [ 11873,  11873]      *
    ##    1:69100        1 [ 69100,  69100]      *
    ##   1:752761        1 [752761, 752761]      *
    ##   1:752761        1 [752761, 752761]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
    
    r2[hits,]
    
    ## GRanges object with 4 ranges and 0 metadata columns:
    ##                  seqnames           ranges strand
    ##                     <Rle>        <IRanges>  <Rle>
    ##          DDX11L1        1 [ 11869,  14412]      *
    ##            OR4F5        1 [ 69091,  70008]      *
    ##   RP11-206L10.10        1 [745489, 753092]      *
    ##           FAM87B        1 [752751, 755214]      *
    ##   -------
    ##   seqinfo: 1 sequence from an unspecified genome; no seqlengths