Insurance Crawler
Load Libraries
library(rvest)
library(xml2)
library(Rcrawler)
Part 1: Get relevant Insurance Company Websites
We use Bing instead of Google, because scraping Google results appears to require the paid Google Custom Search API.
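Bing may also throttle or block rapid automated requests. A politer fetch with a small delay and an explicit User-Agent could look like the sketch below (the helper name and header string are illustrative assumptions, and httr is assumed to be installed); the code in this post uses plain read_html() for simplicity.
library(httr)
# hypothetical helper: pause between requests and identify the crawler
polite_read_html <- function(url, delay = 2) {
  Sys.sleep(delay)  # simple rate limiting
  resp <- GET(url, user_agent("insurance-crawler-demo"))
  stop_for_status(resp)
  read_html(content(resp, as = "text", encoding = "UTF-8"))
}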
Toy Example:
companies <- data.frame(company = c("Allstate", "Amica Mutual Insurance", "Reliance Insurance Company"),
                        website_link = rep("", 3),
                        stringsAsFactors = FALSE)
Search URL:
basic_search_url <- "https://www.bing.com/search?q="
For every company, fetch the Bing results, drop any result carrying the "b_adurl" (ad) class, and drop any that leads to Wikipedia.
for (i in 1:nrow(companies)) {
  company <- companies[i, "company"]
  print(paste0(company, "..."))
  # if the company name already contains "insurance", search for it verbatim;
  # otherwise append "insurance" to disambiguate the query
  if (grepl(pattern = "insurance", x = company, ignore.case = TRUE)) {
    search_url <- paste0(basic_search_url, '"', company, '"')
  } else {
    search_url <- paste0(basic_search_url, '"', company, '"', '+insurance')
  }
  # fetch the HTML of the results page
  pg <- read_html(URLencode(search_url))
  # every Bing result carries a .b_attribution node with the displayed URL
  links <- html_nodes(pg, ".b_attribution")
  # ads are marked with the "b_adurl" class
  is_ad <- unlist(lapply(links, FUN = function(x) {
    grepl(pattern = "b_adurl", x = x)
  }))
  no_ad_links <- list()
  count <- 0
  for (j in 1:length(links)) {
    if (is_ad[j] == FALSE) {
      # skip results that point to Wikipedia (or have no displayed URL at all)
      cite_text <- html_text(html_node(links[[j]], "cite"))
      if (!is.na(cite_text) && !grepl(pattern = "wikipedia", x = cite_text)) {
        count <- count + 1
        no_ad_links[[count]] <- links[[j]]
      }
    }
  }
  # use the first remaining result, if any
  if (length(no_ad_links) > 0) {
    companies[i, "website_link"] <- html_text(html_node(no_ad_links[[1]], "cite"))
  }
}
[1] "Allstate..."
[1] "Amica Mutual Insurance..."
[1] "Reliance Insurance Company..."
companies
                     company                      website_link
1                   Allstate          https://www.allstate.com
2     Amica Mutual Insurance     https://www.amica.com/en.html
3 Reliance Insurance Company relianceins.com/reliance-homepage
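Note that the third result comes back without a scheme, which read_html() and LinkExtractor() in Part 2 cannot resolve. One possible fix is to prepend "https://" where it is missing (a quick sketch, not validated against redirects):
# prepend a scheme where the scraped citation lacks one
needs_scheme <- !grepl("^https?://", companies$website_link)
companies$website_link[needs_scheme] <- paste0("https://",
                                               companies$website_link[needs_scheme])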
Part 2: Crawl Websites
For every company homepage, extract the list of internal links; then fetch and store the text content of every internal page.
website_contents <- list()
for (i in 1:nrow(companies)) {
  company <- companies[i, "company"]
  website_link <- companies[i, "website_link"]
  # get all internal links (the argument really is spelled "ExternalLInks" in Rcrawler)
  all_internal_links <- LinkExtractor(url = website_link, ExternalLInks = FALSE)$InternalLinks
  all_internal_links <- c(website_link, all_internal_links)
  contents <- list()
  count <- 0
  # scrape the body text of every internal page
  for (internal_link in all_internal_links) {
    count <- count + 1
    print(paste0(count, " of ", length(all_internal_links)))
    content <- ""
    # ContentScraper() can fail on broken pages; keep going in that case
    try({
      content <- ContentScraper(Url = as.character(internal_link),
                                XpathPatterns = "//html/body")
    })
    indiv_content <- list()
    indiv_content$company <- company
    indiv_content$link <- internal_link
    indiv_content$content <- content
    contents[[count]] <- indiv_content
  }
  website_contents[[i]] <- contents
}
Example Output:
website_contents[[3]][[3]]$company
[1] "Reliance Insurance Company"
website_contents[[3]][[3]]$link
[1] "relianceins.com/reliance-homepage/"
cat(
  substr(
    as.character(website_contents[[3]][[3]]$content), 500, 1000
  )
)
Home
About
General Insurance
About Company
Profile
Associated Companies
Bankers
Board Memebers
Management Team
Takaful (WTO)
About Company Takaful
Company Profile Takaful
Associated Companies
Bankers
Board Memebers
Management Team
Code of conduct
Compliance Certificate
Services
General Insurance
Fire Insurance
Marine Cargo
Motor Insurance
Miscellaneous Insurance
Engineering Insurance
Bond Guarantee Insurance
Terrorism Insurance
Aviation Insurance
Takaful (WTO)
Fire Takaful
Marine Cargo Takafu
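Crawling is slow, so it can pay off to cache the raw contents before experimenting with the filters in Part 3 (a small convenience step, not in the original pipeline):
# cache the crawl so Part 3 can be iterated on without re-crawling
saveRDS(website_contents, file = "website_contents.rds")
# restore later with: website_contents <- readRDS("website_contents.rds")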
Part 3: Apply Filters per Insurance Category
Toy Filters:
keywords_car_insurance <- c("car","auto")
keywords_motorcycle_insurance <- c("bike","motorcycle","motor cycle")
keywords_fire_insurance <- c("fire","burn")
keywords_boat_insurance <- c("boat")
keywords_terrorism_insurance <- c("Terrorism","terror")
keywords_identity_protection <- c("Identity Protection")
keywords_umbrella <- c("umbrella")
Check for the presence of the filter keywords per category, either using just the start page (approach 1) or all crawled pages (approach 2).
filters <- list(keywords_car_insurance,
keywords_motorcycle_insurance,
keywords_fire_insurance,
keywords_boat_insurance,
keywords_terrorism_insurance,
keywords_identity_protection,
keywords_umbrella)
insurance_presence <- data.frame(car=rep(FALSE,nrow(companies)),
motorcycle=rep(FALSE,nrow(companies)),
fire=rep(FALSE,nrow(companies)),
boat=rep(FALSE,nrow(companies)),
terrorism = rep(FALSE,nrow(companies)),
identity_protection = rep(FALSE,nrow(companies)),
umbrella = rep(FALSE,nrow(companies))
)
rownames(insurance_presence)<-companies[,"company"]
# approach 1: use just the main page
# approach 2: use all crawled pages
for (i in 1:nrow(companies)){
# approach 1
#text = as.character(website_contents[[i]][[1]]$content)
# approach 2
text = paste(unlist(lapply(website_contents[[i]],FUN = function(x){
as.character(x$content)
})),collapse = " ")
for (j in 1:length(filters)){
found = grepl(pattern = paste0(filters[[j]],collapse = "|"),x = text,ignore.case = T)
if(isTRUE(found)){
insurance_presence[i,j]<-TRUE
}
}
}
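These substring filters over-trigger: "car" also matches "card" or "scary". A word-boundary pattern (a sketch, not part of the toy filters above) would reduce such false positives:
# build a pattern that only matches whole words
boundary_pattern <- function(keywords) {
  paste0("\\b(", paste(keywords, collapse = "|"), ")\\b")
}
grepl(boundary_pattern(keywords_car_insurance), "rental car coverage",
      ignore.case = TRUE, perl = TRUE)  # TRUE
grepl(boundary_pattern(keywords_car_insurance), "gift card",
      ignore.case = TRUE, perl = TRUE)  # FALSE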
library(DT)
datatable(insurance_presence, options = list(dom = "t"),
          class = 'cell-border stripe row-border compact') %>%
  formatStyle(
    columns = colnames(insurance_presence),
    target = 'cell',
    backgroundColor = styleEqual(c(0, 1), c('#f0827a', '#72f290')),
    fontWeight = styleEqual(c(0, 1), c("normal", "bold"))
  )
Of course, all of these parts need to be specified further…
Part 4: Evaluation Metrics
…