This repository was archived by the owner on May 24, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgoogle.py
More file actions
160 lines (138 loc) · 5.92 KB
/
google.py
File metadata and controls
160 lines (138 loc) · 5.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import re
import time
import dateparser
import requests
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
def scroll(driver, timeout, element):
scroll_pause_time = timeout
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(scroll_pause_time)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
# If heights are the same it will exit the function
break
last_height = new_height
def scrape_google_reviews(page_url, driver):
# Get's page set in page_url and opens it
driver.get(page_url)
def check_exists_by_xpath(xpath):
try:
driver.find_element_by_xpath(xpath)
except NoSuchElementException:
return False
return True
data = {}
data['listing'] = []
data['reviews'] = []
# Find the name of the listing and captures in name
name = driver.find_element_by_xpath("//h2[starts-with(@class, 'qrShPb kno-ecr-pt PZPZlf mfMhoc')]//span").text
# Find the total number of reviews and captures in number_of_reviews
number_of_reviews = re.findall("\d+",
driver.find_element_by_xpath("//span[@class='hqzQac']//span//a//span").text.replace(
',', ''))[0]
# Find the average review score
average_review = \
re.findall("\d+", driver.find_element_by_xpath("//span[@class='Aq14fc']").text.replace('.', ''))[
0]
# Checks address is provided on listing
if check_exists_by_xpath("//span[@class='LrzXr']"):
# Set's address as address on listing
address = driver.find_element_by_xpath(
"//span[@class='LrzXr']").text
else:
# Set's address as None
address = None
# Checks telephone number provided on listing
if check_exists_by_xpath("//span[@class='LrzXr zdqRlf kno-fv']"):
# Set's telephone as telephone on listing
telephone = driver.find_element_by_xpath("//span[@class='LrzXr zdqRlf kno-fv']").text
else:
# Set's telephone as None
telephone = None
# Checks website url provide on listing
if check_exists_by_xpath("//a[text()='Website']"):
# Set's website as website on listing
redirect_url = driver.find_element_by_xpath(
"//a[text()='Website']").get_attribute(
"href")
website = redirect_url
else:
# Set's website as None
website = None
# Add's the listing details above as json object
data['listing'].append({
'name': name,
'number_of_reviews': number_of_reviews,
'average_review': average_review,
'address': address,
'telephone': telephone,
'website': website,
})
driver.find_element_by_xpath("//span[contains(text(), 'Google reviews')]").click()
time.sleep(5)
driver.find_element_by_xpath("//span[contains(text(),'Most relevant')]").click()
time.sleep(5)
driver.find_element_by_xpath("//div[contains(text(),'Newest')]").click()
element_inside_popup = driver.find_element_by_xpath("//div[@class='review-dialog-list']")
time.sleep(2)
while True: # scroll 5 times
last_top = driver.execute_script('return arguments[0].scrollTop;', element_inside_popup)
driver.execute_script('arguments[0].scrollTop = arguments[0].scrollTop + 1000;', element_inside_popup)
time.sleep(2)
new_top = driver.execute_script('return arguments[0].scrollTop;', element_inside_popup)
if new_top == last_top:
# If heights are the same it will exit the function
break
reviews = driver.find_elements_by_xpath("//div[@jscontroller='e6Mltc']")
for y in reviews:
# Checks if review has user id, not all do, mainly old reviews
try:
# Scrapes user id element
userid_element = y.find_element_by_xpath(
'.//div[@class="TSUbDb"]')
# Extracts text from user id element
userid = userid_element.text
except NoSuchElementException:
# Sets user id as none
userid = None
# Scrapes review date element
review_date_element = y.find_element_by_xpath('.//span[@class="dehysf"]')
# Extract's review date from title attribute
review_date = review_date_element.text
try:
review_body_element = y.find_element_by_xpath(
'.//span[@class="review-full-text"]')
review_body = review_body_element.text
except NoSuchElementException:
try:
# Scrapes review body element
review_body_element = y.find_element_by_xpath(
'.//div[@class="Jtu6Td"]')
# Extracts review body text
review_body = review_body_element.text
except NoSuchElementException:
pass
# Scrapes review rating element
review_rating_element = y.find_element_by_xpath(
'.//span[@class="Fam1ne EBe2gf"]')
# Extracts review rating from class, the end of the class name is the rating e.g. bubble_40
review_rating = re.findall("\d+", review_rating_element.get_attribute("aria-label"))[0]
# Append the list with the data extracted
data['reviews'].append({
'userid': userid,
'review_date': review_date,
'review_body': review_body,
'review_rating': review_rating
})
driver.quit()
return data