Monday, July 3, 2017

Indeed Scraper

Indeed is a social forum for job postings, employee comments, and company reviews. One can gauge employee sentiment toward an employer by reading the comments.

Nowadays, many companies want to integrate their internal data warehouses with external data sets. Below is an Indeed Python scraper that crawls the comments, ratings, etc. and dumps the output to a CSV file.

# load the library
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, lxml, csv, sys
import pandas as pd

#http://www.indeed.com/cmp/Google/reviews?fcountry=US

df = pd.DataFrame()  # create a new data frame
host = 'http://www.indeed.com'
cmp_tag = '/cmp/'
rev = '/reviews'
countryfilter = '?fcountry=US'
isFlag = True
with open('companies.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
                isFlag = True
  try:
   company_name = row[0]
    
   # complete URL
   url = '%s%s%s%s%s' % (host, cmp_tag, company_name, rev, countryfilter)
                        print "%s" %(url)
   target = Soup(urllib.urlopen(url), 'lxml')                          
   targetElements = target.findAll('div', attrs={'class': 'cmp-review-container'})
   
   for elem in targetElements:
    cmp_review = elem.find('div', attrs={'class': 'cmp-review'})
    
    # Review Heading
    cmp_review_heading_cont = cmp_review.find('div', attrs={'class': 'cmp-review-heading'})   
    cmp_review_heading_title = cmp_review_heading_cont.find('span', attrs={'itemprop': 'name'}).getText()     
    
    # Review Overall Rating    
    cmp_review_heading_cont_span = cmp_review_heading_cont.find('span', attrs={'itemprop': 'reviewRating'})
    cmp_review_heading_rating = cmp_review_heading_cont_span.find('meta').attrs['content']    
      
    # Review Content
    cmp_review_cc = cmp_review.find('div', attrs={'class': 'cmp-review-content-container'})
    cmp_review_desc = cmp_review_cc.find('div', attrs={'class': 'cmp-review-description'})
    review_text = cmp_review_desc.find('span', attrs={'class':'cmp-review-text'}).getText()
    df = df.append({'comp_name': company_name, 'comment': review_text, 'Review Rating': cmp_review_heading_rating, 'Review Heading': cmp_review_heading_title},
    ignore_index=True)
   while isFlag:
    head_ele = target.find('head')
    next_href = head_ele.find('link', attrs={'rel': 'next'})
    if next_href != None:
     next_href = next_href.get('href')
     url = '%s%s' % (host, next_href)
     print '%s' % url
     target = Soup(urllib.urlopen(url), 'lxml')
     targetElements = target.findAll('div', attrs={'class': 'cmp-review-container'})     
     for elem in targetElements:      
      cmp_review = elem.find('div', attrs={'class': 'cmp-review'})
      
      # Review Heading
      cmp_review_heading_cont = cmp_review.find('div', attrs={'class': 'cmp-review-heading'})   
      cmp_review_heading_title = cmp_review_heading_cont.find('span', attrs={'itemprop': 'name'}).getText()     
    
      # Review Overall Rating    
      cmp_review_heading_cont_span = cmp_review_heading_cont.find('span', attrs={'itemprop': 'reviewRating'})
      cmp_review_heading_rating = cmp_review_heading_cont_span.find('meta').attrs['content']    
      
      cmp_review_cc = cmp_review.find('div', attrs={'class': 'cmp-review-content-container'})
      cmp_review_desc = cmp_review_cc.find('div', attrs={'class': 'cmp-review-description'})
      review_text = cmp_review_desc.find('span', attrs={'class':'cmp-review-text'}).getText()
      
      
      df = df.append({'comp_name': company_name, 'comment': review_text, 'Review Rating': cmp_review_heading_rating,'Review Heading': cmp_review_heading_title},
      ignore_index=True)
    else:
     isFlag = False
  except Exception as e:
   print("Unexpected error:", sys.exc_info()[0])

# Save the result to CSV
df.to_csv('D:\indeed_reviews.csv', encoding='utf-8')

       
 

No comments:

Post a Comment

Apache Spark – Catalyst Optimizer

Optimizer is the one that automatically finds out the most efficient plan to execute data operations specified in the user’s program. In...