R text mining

Requirement Description

Environment Settings and load package for text mining

The first time you run this, install the packages required for the following steps. Once they are installed, you can disable the install calls by commenting them out with “#”.

#options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
#options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")

#install.packages()
#source("https://bioconductor.org/biocLite.R")
#biocLite("rentrez")
#biocLite("pubmed.mineR")
#biocLite("ggplot2")
#install.packages("jiebaR")
library(jiebaR)
library(rentrez)
library(pubmed.mineR)
library(bibliometrix) # http://bibliometrix.org/documents/bibliometrix_Report.html
library(ggplot2)
library(ggrepel)
library(ggthemes)
library(DT)
library(htmlwidgets)

# Ask R to report its messages and errors in English.
Sys.setenv(LANGUAGE = "en")
# Keep character data as plain strings instead of converting it to factors.
options(stringsAsFactors = FALSE)

Firstly, let’s start by opening an R session and creating a new folder for this text mining case study.

# words <- read.delim(pipe("pbpaste"))

# Start from a clean workspace: remove every visible object defined so far.
rm(list = ls())

# Make sure the case-study folder exists, then work from inside it.
if (!file.exists("~/Desktop/text.mining")) {
  dir.create("~/Desktop/text.mining")
}
setwd("~/Desktop/text.mining")

# Reference data: the common-word list and the total human gene symbol set.
# Copy common_words_new.rda and HGNCdata.rda into the text.mining folder
# created above before running these two lines, otherwise load() fails.
load("common_words_new.rda")
load("HGNCdata.rda")

Building search terms

Secondly, use the entrez_search function from the rentrez package to search for the keywords and terms you are interested in.

The EUtils API uses a special syntax to build search terms. We can search a database against a specific term using the format query[SEARCH FIELD], and combine multiple such searches using the boolean operators AND, OR and NOT.

For instance, we can find next generation sequence datasets for the (amazing…) ciliate Tetrahymena thermophila by using the organism (‘ORGN’) search field:

# Example: search the SRA database for Tetrahymena thermophila records dated
# 2013 to 2015. retmax = 0 requests no record IDs, only the search summary.
entrez_search(
  db = "sra",
  term = "Tetrahymena thermophila[ORGN] AND 2013:2015[PDAT]",
  retmax = 0
)

Here, we use NCBI Medical Subject Headings (MeSH; see https://en.wikipedia.org/wiki/Medical_Subject_Headings) as search fields, again using the format query[SEARCH FIELD] and combining multiple such searches with the operators AND, OR and NOT.

  1. We use (Nasopharyngeal Neoplasms) AND [(Whole Exome Sequencing) OR (High-Throughput Nucleotide Sequencing)] to search all the published papers in pubmed.
  2. We use (Nasopharyngeal Neoplasms) AND [(Epstein-Barr Virus Infections) OR (Papillomaviridae)] to search all the published papers in pubmed
# ---------------------------------------------------------------------
# Search field tags used in the queries below:
#   [All Fields]  keywords
#   [ORGN]        organism
#   [MeSH]        Medical Subject Headings
#   [PDAT]        years
# ---------------------------------------------------------------------

# Query 1 (disabled): "Nasopharyngeal" in any field AND the MeSH heading
# Whole Exome Sequencing OR High-Throughput Nucleotide Sequencing.
# Search.v1 <- entrez_search(db="pubmed", term="(Nasopharyngeal[All Fields]) AND ((Whole Exome Sequencing[MeSH]) OR (High-Throughput Nucleotide Sequencing[MeSH]))", retmode = "xml", retmax=20000)

# Query 2: "Nasopharyngeal" in any field AND the MeSH heading
# Epstein-Barr Virus Infections OR Papillomaviridae; up to 20000 IDs returned.
Search.v2 <- entrez_search(
  db = "pubmed",
  term = "(Nasopharyngeal[All Fields]) AND ((Epstein-Barr Virus Infections[MeSH]) OR (Papillomaviridae[MeSH]))",
  retmode = "xml",
  retmax = 20000
)

# Select the search whose papers are downloaded in the following steps.
Search <- Search.v2
# Download the summary fields uid, pubdate, pmcrefcount, title and
# fulljournalname for every hit in `Search`, requesting at most 200 PubMed IDs
# per call, and collect them in `res` (one row per paper, all columns character).
batch_size <- 200
n_ids <- length(Search$ids)
# seq(1, 0, by = 200) is an error, so a search without hits gets no batches.
batch_starts <- if (n_ids > 0) seq(1, n_ids, by = batch_size) else integer(0)

res_batches <- lapply(batch_starts, function(i) {
  # End the last batch at the final ID: indexing past the end of Search$ids
  # would pad the request with NA ids.
  batch_ids <- Search$ids[i:min(i + batch_size - 1, n_ids)]
  # always_return_list keeps the result a list of summaries even when the
  # batch holds a single ID, so the reshaping below works for every batch size.
  multi_summ <- entrez_summary(db = "pubmed", id = batch_ids, always_return_list = TRUE)
  date_and_cite <- extract_from_esummary(
    multi_summ,
    c("uid", "pubdate", "pmcrefcount", "title", "fulljournalname")
  )
  # Fields arrive as rows; transpose so each paper becomes one row.
  res1 <- data.frame(t(date_and_cite))
  data.frame(lapply(res1, as.character), stringsAsFactors = FALSE)
})

# Bind once at the end instead of growing the data frame inside a loop.
# With no batches this yields NULL, the value the original `res <- c()` had.
res <- do.call(rbind, res_batches)

# pmcrefcount was downloaded as text: convert it to numbers once and store 0
# wherever the value cannot be read as a number.
res$pmcrefcount <- as.numeric(res$pmcrefcount)
res$pmcrefcount[is.na(res$pmcrefcount)] <- 0

# Show the papers as an interactive table with numeric columns rounded to 4
# decimals. is.numeric() replaces class(x) == "numeric", which produces a
# length > 1 condition for multi-class columns and skips integer columns.
datatable(as.data.frame(
  lapply(res, function(x) if (is.numeric(x)) round(x, 4) else x),
  check.names = FALSE,
  row.names = row.names(res)
))

As you can see above, the object returned by entrez_search() includes the number of records matching a given search. This means you can learn a little about the composition of, or trends in, the records stored in the NCBI’s databases using only the search utility. For instance, let’s track how the number of PubMed papers matching our nasopharyngeal search term changes from year to year, programmatically creating search terms for the PDAT field:

# Count the PubMed records that match `term` within a single year.
#
# year: year appended to the query through the [PDAT] field.
# term: Entrez search term to restrict.
# Returns the `count` element of the entrez_search() result.
search_year <- function(year, term) {
  year_query <- paste(term, "AND (", year, "[PDAT])", sep = " ")
  # retmax = 0: only the count is needed, not the matching IDs.
  hits <- entrez_search(db = "pubmed", term = year_query, retmax = 0)
  hits$count
}

# Number of matching PubMed papers for each year from 1980 through 2018:
# one search_year() call (one Entrez request) per year.
year <- 1980:2018
papers <- sapply(
  X = year,
  FUN = search_year,
  term = "(Nasopharyngeal[All Fields]) AND ((Epstein-Barr Virus Infections[MeSH]) OR (Papillomaviridae[MeSH]))",
  USE.NAMES = FALSE
)

# Points joined by lines (type "b"), one point per year.
plot(year, papers, type = "b", main = "The Rise of the Nasopharyngeal paper")

We can also extract the abstracts to count which genes are mentioned, and how often, in the papers found with the search term (Nasopharyngeal[All Fields]) AND ((Epstein-Barr Virus Infections[MeSH]) OR (Papillomaviridae[MeSH]))

# Fetch the full PubMed XML record of every hit in `Search`, at most 200 IDs
# per request, and stack the parsed records into `abstract` (one row per
# record; the `abstract` column is used in the next step).
batch_size <- 200
n_ids <- length(Search$ids)
# seq(1, 0, by = 200) is an error, so a search without hits gets no batches.
batch_starts <- if (n_ids > 0) seq(1, n_ids, by = batch_size) else integer(0)

abstract_batches <- lapply(batch_starts, function(i) {
  # End the last batch at the final ID: indexing past the end of Search$ids
  # would pad the request with NA ids.
  batch_ids <- Search$ids[i:min(i + batch_size - 1, n_ids)]
  rec <- parse_pubmed_xml(entrez_fetch(db = "pubmed", id = batch_ids, rettype = "xml"))
  # NOTE(review): parse_pubmed_xml() appears to return a bare "pubmed_record"
  # (not a list of records) when the batch holds one article; wrap it so that
  # rbind() still produces one row per record. Confirm against rentrez docs.
  if (inherits(rec, "pubmed_record")) {
    rec <- list(rec)
  }
  as.data.frame(do.call("rbind", rec))
})

# Bind once at the end instead of growing the data frame inside a loop.
# With no batches this yields NULL, the value the original `abstract <- c()` had.
abstract <- do.call(rbind, abstract_batches)

# Count how often each approved gene symbol from HGNCdata occurs in the
# downloaded abstracts and show the result as a table sorted by frequency.

# Search abstract
#length(abstract1$abstract) # how many abstract
#abstract1$abstract[[70]] # which one you are looking for

abstract <- unlist(abstract$abstract)

# Break the abstracts into tokens by splitting on one delimiter after another.
# Each pass works on the output of the previous pass (the original re-split an
# earlier intermediate and so lost the "/" split), and fixed = TRUE makes every
# delimiter a literal string, so "|" and "." must not be escaped.
delimiters <- c(",", ":", ";", "'", " ", "/", "|", "(", ")", "-", ".")
tokens <- as.character(abstract)
for (delimiter in delimiters) {
  # as.character() keeps the input valid for strsplit() when nothing is left.
  tokens <- as.character(unlist(strsplit(tokens, delimiter, fixed = TRUE)))
}
tokens <- toupper(tokens)

# Frequency of every distinct upper-cased token.
token_counts <- as.data.frame(table(token = tokens), stringsAsFactors = FALSE)

# Drop the words listed in common_words_new. A logical filter is used because
# a negative index built from zero matches would discard every row.
is_common <- token_counts$token %in% toupper(common_words_new)
token_counts <- token_counts[!is_common, ]

# Keep only tokens that are approved gene symbols, in HGNCdata order, then
# sort them by decreasing frequency.
approved_symbols <- as.character(HGNCdata$Approved.Symbol)
symbol_rows <- match(approved_symbols, token_counts$token)
gene_counts <- token_counts[symbol_rows[!is.na(symbol_rows)], ]
gene_counts <- gene_counts[order(as.numeric(gene_counts$Freq), decreasing = TRUE), ]

# Look up the approved gene name of each symbol; match() returns exactly one
# index per symbol, so names stay aligned with the sorted rows.
hgnc_rows <- match(gene_counts$token, approved_symbols)
Genes <- as.character(HGNCdata$Approved.Name[hgnc_rows])

# Symbol, name and frequency all come from the same sorted table (the original
# took Freq from the unsorted table, misaligning counts and symbols).
data_table <- data.frame(
  Gene_symbol = as.character(gene_counts$token),
  Genes = Genes,
  Freq = as.numeric(gene_counts$Freq),
  stringsAsFactors = FALSE
)

# Interactive table, numeric columns rounded to 4 decimals.
datatable(as.data.frame(
  lapply(data_table, function(x) if (is.numeric(x)) round(x, 4) else x),
  check.names = FALSE,
  row.names = row.names(data_table)
))

Enjoy! I hope this helps you speed up your research!

# Print the R version, platform and loaded package versions used for this run.
sessionInfo()