Insurance Crawler

Load Libraries

library(rvest)
library(xml2)
library(Rcrawler)

Part 1: Get relevant Insurance Company Websites

Using Bing instead of Google, because Google seems to force the use of its paid search API for programmatic queries.

Toy Example:

companies = data.frame(company = c("Allstate","Amica Mutual Insurance","Reliance Insurance Company"),
                       website_link=rep("",3),
                       stringsAsFactors = F)

Search URL:

basic_search_url = "https://www.bing.com/search?q="
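The quoted company name is URL-encoded before the request. For example, for a company whose name does not contain “insurance” (the encoded output shown in the comment is illustrative):

# example: final query URL for "Allstate" (keyword appended, name lacks "insurance")
URLencode(paste0(basic_search_url, '"', "Allstate", '"', "+insurance"))
# e.g. "https://www.bing.com/search?q=%22Allstate%22+insurance"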

For every company, fetch the Bing results, remove those marked with the “b_adurl” tag (ads), and remove those that lead to Wikipedia.

for(i in 1:nrow(companies)){
  company = companies[i,"company"]
  print(paste0(company,"..."))
  # company uses "insurance" in its name
  if(grepl(pattern = "insurance",x = company,ignore.case = T)){
    search_url <- paste0(basic_search_url, '"',company,'"')
  }
  else{
    search_url <- paste0(basic_search_url, '"',company,'"','+insurance')
  }
  
  # get html code for the result page
  pg <- read_html(x = URLencode(search_url))
  
  # attribution nodes hold the displayed source URL of each result
  links <- html_nodes(pg, ".b_attribution")
  # results whose markup contains "b_adurl" are ads
  is_ad <- unlist(lapply(links,FUN = function(x){
    grepl(pattern = "b_adurl",x = as.character(x))
  }))
  no_ad_links = list()
  count = 0
  for( j in seq_along(links)){
    if(is_ad[j]==FALSE){
      # remove wikipedia
      if(!grepl(pattern = "wikipedia",x = html_text(html_node(links[[j]],"cite")))){
        count = count +1 
        no_ad_links[[count]] <- links[[j]]
      }
    }
  }
  # skip companies for which no usable result was found
  if(length(no_ad_links) == 0) next
  
  # use the first remaining result
  link <- no_ad_links[[1]]
  link <- html_text(html_node(link,"cite"))
  companies[i,"website_link"] = link
}
[1] "Allstate..."
[1] "Amica Mutual Insurance..."
[1] "Reliance Insurance Company..."
companies
                     company                      website_link
1                   Allstate          https://www.allstate.com
2     Amica Mutual Insurance     https://www.amica.com/en.html
3 Reliance Insurance Company relianceins.com/reliance-homepage
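
Note that the cite text is not always a full URL (the third result lacks the scheme). Prepending one where it is missing avoids problems when crawling in Part 2; a minimal sketch:

# prepend "http://" where the scraped link lacks a scheme
companies$website_link <- ifelse(grepl("^https?://", companies$website_link),
                                 companies$website_link,
                                 paste0("http://", companies$website_link))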

Part 2: Crawl Websites

For every company homepage, get a list of all internal links. Then, for every internal link, fetch the page content and save it.
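
As a quick sanity check, LinkExtractor returns a list whose InternalLinks element holds the in-domain URLs found on a page (results will naturally vary with the live site):

# peek at the internal links of the first homepage
res <- LinkExtractor(url = companies[1, "website_link"], ExternalLInks = F)
head(res$InternalLinks)

The full crawl loop: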

website_contents <- list() # collects the crawled pages per company
for (i in 1:nrow(companies)){
  company <- companies[i,"company"]
  website_link <- companies[i,"website_link"]
  # get all internal links
  all_internal_links <- LinkExtractor(url = website_link,ExternalLInks = F)$InternalLinks
  all_internal_links <- c(website_link, all_internal_links)
  contents <- list()
  count=0
  # get website texts of all internal pages
  for(internal_link in all_internal_links){
    count=count+1
    print(paste0(count," of ",length(all_internal_links)))
    content =""
    try({
      content <-ContentScraper(Url = as.character(internal_link),
                               XpathPatterns = "//html/body")
    })
    indiv_content = list()
    indiv_content$company <- company
    indiv_content$link <- internal_link
    indiv_content$content <- content
    contents[[count]]<-indiv_content
  }
  website_contents[[i]]<-contents
}
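
The nested list can also be flattened into one row per crawled page, which makes the filtering in Part 3 easier to express; a minimal sketch (not used below, the nested list works as well):

# flatten the nested per-company list into a single data frame
pages_df <- do.call(rbind, lapply(website_contents, function(comp) {
  do.call(rbind, lapply(comp, function(p) {
    data.frame(company = p$company,
               link    = as.character(p$link),
               content = paste(unlist(p$content), collapse = " "),
               stringsAsFactors = FALSE)
  }))
}))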

Example Output:

website_contents[[3]][[3]]$company
[1] "Reliance Insurance Company"
website_contents[[3]][[3]]$link
[1] "relianceins.com/reliance-homepage/"
cat(
  substr(
    as.character(website_contents[[3]][[3]]$content), 500, 1000
  )
)
Home
About
General Insurance
About Company
Profile
Associated Companies
Bankers
Board Memebers
Management Team

Takaful (WTO)
About Company Takaful
Company Profile Takaful
Associated Companies
Bankers
Board Memebers
Management Team

Code of conduct
Compliance Certificate

Services
General Insurance
Fire Insurance
Marine Cargo
Motor Insurance
Miscellaneous Insurance
Engineering Insurance
Bond Guarantee Insurance
Terrorism Insurance
Aviation Insurance

Takaful (WTO)
Fire Takaful
Marine Cargo Takafu

Part 3: Apply Filters per Insurance Category

Toy Filters:

keywords_car_insurance <- c("car","auto")
keywords_motorcycle_insurance <- c("bike","motorcycle","motor cycle")
keywords_fire_insurance <- c("fire","burn")
keywords_boat_insurance <- c("boat")
keywords_terrorism_insurance <- c("Terrorism","terror")
keywords_identity_protection <- c("Identity Protection")
keywords_umbrella<-c("umbrella")

Check the presence of the filter keywords per category: either use only the start page (approach 1) or all crawled pages (approach 2).

filters <- list(keywords_car_insurance,
                keywords_motorcycle_insurance, 
                keywords_fire_insurance,
                keywords_boat_insurance,
                keywords_terrorism_insurance,
                keywords_identity_protection,
                keywords_umbrella)

insurance_presence <- data.frame(car=rep(FALSE,nrow(companies)),
                                 motorcycle=rep(FALSE,nrow(companies)),
                                 fire=rep(FALSE,nrow(companies)),
                                 boat=rep(FALSE,nrow(companies)),
                                 terrorism = rep(FALSE,nrow(companies)),
                                 identity_protection = rep(FALSE,nrow(companies)),
                                 umbrella = rep(FALSE,nrow(companies))
)
rownames(insurance_presence)<-companies[,"company"]

# approach 1: just use the main page
# approach 2: use all web pages found

for (i in 1:nrow(companies)){
  # approach 1
  #text = as.character(website_contents[[i]][[1]]$content)
  # approach 2
  text = paste(unlist(lapply(website_contents[[i]],FUN = function(x){
    as.character(x$content)
  })),collapse = " ")
  for (j in 1:length(filters)){
    found = grepl(pattern = paste0(filters[[j]],collapse = "|"),x = text,ignore.case = T)
    if(isTRUE(found)){
      insurance_presence[i,j]<-TRUE
    }
  }
}
DT::datatable(insurance_presence, options = list(dom = "t"),
              class = 'cell-border stripe row-border compact') %>%
  DT::formatStyle(
    columns = colnames(insurance_presence),
    target = 'cell',
    backgroundColor = DT::styleEqual(c(0, 1), c('#f0827a', '#72f290')),
    fontWeight = DT::styleEqual(c(0, 1), c("normal", "bold"))
  )
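
One caveat: plain substring matching over-matches, e.g. the pattern "car" also fires on "careers" or "card". Wrapping the keywords in word boundaries, as in the sketch below, would tighten the filters (a possible refinement, not applied above):

# word-boundary variant of the keyword pattern for one category
pattern_wb <- paste0("\\b(", paste0(filters[[1]], collapse = "|"), ")\\b")
grepl(pattern_wb, "visit our careers page", ignore.case = TRUE)   # FALSE
grepl(pattern_wb, "get a car insurance quote", ignore.case = TRUE) # TRUE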

Of course, all of these parts need to be specified further…

Part 4: Evaluation Metrics
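
A natural starting point, assuming a hand-labelled gold-standard matrix insurance_presence_gold with the same shape as insurance_presence (hypothetical, it does not exist yet), would be precision, recall, and F1 over all company/category cells:

# precision/recall against a hypothetical hand-labelled gold standard
# insurance_presence_gold <- ... (same shape as insurance_presence)
pred <- as.matrix(insurance_presence)
gold <- as.matrix(insurance_presence_gold)
tp <- sum(pred & gold)
fp <- sum(pred & !gold)
fn <- sum(!pred & gold)
precision <- tp / (tp + fp)
recall    <- tp / (tp + fn)
f1        <- 2 * precision * recall / (precision + recall)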