Indeed is a job-posting site that also serves as a forum for employee comments and company reviews. Reading those comments gives a good sense of how employees feel about an employer.
Nowadays, many companies want to integrate their internal data warehouse with external data sets. Below is an Indeed scraper written in Python that crawls the comments, ratings, etc. and dumps the output to a CSV file.
# load the library
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, lxml, csv, sys
import pandas as pd
#http://www.indeed.com/cmp/Google/reviews?fcountry=US
# ---------------------------------------------------------------------------
# Indeed company-review scraper (Python 3).
#
# Reads company names from the first column of companies.csv, walks every
# review page of each company by following the <link rel="next"> element in
# the page <head>, and writes all collected reviews to one CSV file.
# ---------------------------------------------------------------------------
host = 'http://www.indeed.com'
cmp_tag = '/cmp/'
rev = '/reviews'
countryfilter = '?fcountry=US'

# Seconds to wait for a response before giving up on a page; without a
# timeout a stalled connection would block the whole run.
REQUEST_TIMEOUT = 30


def _fetch_page(url):
    """Download *url* and return it parsed with the lxml parser.

    Raises a ``requests`` exception on connection problems, timeouts and
    HTTP error statuses, so failures are reported by the caller instead of
    an error page being parsed silently.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Hand over the raw bytes so the parser performs its own encoding
    # detection.
    return Soup(response.content, 'lxml')


def _iter_reviews(page, company_name):
    """Yield one row dict per review container found in *page*.

    Written as a generator so that rows parsed before a malformed review
    are still delivered to the caller. A review missing one of the expected
    elements raises ``AttributeError``.
    """
    for container in page.find_all('div', attrs={'class': 'cmp-review-container'}):
        review = container.find('div', attrs={'class': 'cmp-review'})

        # Review heading and overall rating both live in the heading block.
        heading = review.find('div', attrs={'class': 'cmp-review-heading'})
        title = heading.find('span', attrs={'itemprop': 'name'}).get_text()
        rating_span = heading.find('span', attrs={'itemprop': 'reviewRating'})
        rating = rating_span.find('meta').attrs['content']

        # Review body text.
        content = review.find('div', attrs={'class': 'cmp-review-content-container'})
        description = content.find('div', attrs={'class': 'cmp-review-description'})
        text = description.find('span', attrs={'class': 'cmp-review-text'}).get_text()

        yield {
            'comp_name': company_name,
            'comment': text,
            'Review Rating': rating,
            'Review Heading': title,
        }


def _next_page_url(page):
    """Return the URL of the next review page, or None on the last page."""
    head = page.find('head')
    next_link = head.find('link', attrs={'rel': 'next'}) if head is not None else None
    if next_link is None:
        return None
    return '%s%s' % (host, next_link.get('href'))


# Rows are accumulated in a plain list and converted to a DataFrame once at
# the end: growing a DataFrame row by row copies it on every append, and
# DataFrame.append no longer exists in pandas 2.x.
rows = []

# csv.reader on Python 3 needs a text-mode file opened with newline=''.
with open('companies.csv', newline='') as f:
    for row in csv.reader(f):
        if not row:
            # Blank line in the input file: nothing to scrape.
            continue
        company_name = row[0]
        # Complete URL of the first review page, e.g.
        # http://www.indeed.com/cmp/Google/reviews?fcountry=US
        url = '%s%s%s%s%s' % (host, cmp_tag, company_name, rev, countryfilter)
        try:
            # The first page and every following page are handled by the
            # same loop; it ends when a page has no rel="next" link.
            while url is not None:
                print(url)
                page = _fetch_page(url)
                for review_row in _iter_reviews(page, company_name):
                    rows.append(review_row)
                url = _next_page_url(page)
        except Exception as exc:
            # Best effort: report the failure (type and message) and carry
            # on with the next company. Reviews gathered so far are kept.
            print("Unexpected error:", type(exc), exc)

# Save the result to CSV. Raw string so the backslash is taken literally.
df = pd.DataFrame(rows)
df.to_csv(r'D:\indeed_reviews.csv', encoding='utf-8')