tripadvisor-parser/tripadvisor_parser.py

132 lines
5.2 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import math
import requests
from bs4 import BeautifulSoup, Tag
from tripadvisor_attraction import TripadvisorAttraction, TripadvisorReview
# Firefox-on-Linux User-Agent string used for the HTTP requests in this module.
USERAGENT = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) "
    "Gecko/20100101 Firefox/112.0"
)
# Header mapping passed to ``requests.get`` so each request carries USERAGENT.
HEADERS = {
    "User-Agent": USERAGENT,
}
class ReviewsPageInformation:
    """Pagination parameters of an attraction's review listing.

    Attributes:
        page_size: Number of reviews shown on one listing page.
        page_count: Number of listing pages to visit.
    """

    page_size: int
    page_count: int

    def __init__(self, page_size, page_count):
        """Store the given page size and page count unchanged."""
        self.page_size, self.page_count = page_size, page_count
class TripadvisorAttractionReviewParser:
    """Download and parse all review pages of one Tripadvisor attraction.

    The work is split into stages that store their result on the instance:
    ``construct_reviews_urls`` -> ``request_reviews`` -> ``soup_reviews``.
    ``parse_reviews`` runs all stages and then extracts every review card.
    Each stage rebuilds its list from scratch, so running a stage (or
    ``parse_reviews``) more than once does not duplicate entries.
    """

    # Seconds to wait on each HTTP request. Without a timeout ``requests``
    # can block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT_SECONDS = 30

    # Character that ends the date portion of the "RpeCd" posting-date line.
    # NOTE(review): the separator literal in the previous revision was empty
    # (or an invisible character), and ``str.split("")`` raises ValueError.
    # A bullet (U+2022) is assumed here -- confirm against the live markup.
    POSTING_DATE_SEPARATOR = "\u2022"

    # Padding removed around the extracted date: ordinary and no-break
    # spaces plus zero-width/format characters, all written as explicit
    # escapes so that no invisible literal has to live in the source.
    _POSTING_DATE_PADDING = (
        " \t\r\n\xa0\u2009\u202f"
        "\u200b\u200c\u200d\u200e\u200f\u2060\ufeff"
    )

    reviews_page_information: ReviewsPageInformation
    base_url: str
    review_urls: [str]
    raw_reviews: [str]
    review_soups: [BeautifulSoup]

    def __init__(self, reviews_page_information: ReviewsPageInformation, base_url: str):
        """Remember the pagination info and the URL of the first review page.

        Args:
            reviews_page_information: Page size and page count of the listing.
            base_url: URL of the first review page; it must contain the
                marker ``-Reviews-`` exactly once.
        """
        self.reviews_page_information = reviews_page_information
        self.base_url = base_url
        self.review_urls = []
        self.raw_reviews = []
        self.review_soups = []

    def parse_reviews(self) -> [TripadvisorReview]:
        """Run every stage and return the reviews of all pages in page order.

        Progress messages are printed before each stage.
        """
        print("Constructing Review URLs")
        self.construct_reviews_urls()
        print("Requesting Reviews")
        self.request_reviews()
        print("Souping Reviews")
        self.soup_reviews()
        print("Parsing Reviews")
        reviews = []
        for page_soup in self.review_soups:
            cards = page_soup.find_all("div", attrs={"data-automation": "reviewCard"})
            reviews.extend(self.parse_review_card(cards))
        return reviews

    def construct_reviews_urls(self):
        """Fill ``review_urls`` with the URL of every review page.

        The first entry is ``base_url`` itself. Each further page inserts
        ``or<offset>-`` right after the ``-Reviews-`` marker, with offsets
        ``page_size, 2 * page_size, ...`` below ``page_size * page_count``.

        Raises:
            ValueError: If ``base_url`` does not contain ``-Reviews-``
                exactly once.
        """
        parts = self.base_url.split("-Reviews-")
        if len(parts) != 2:
            raise ValueError(
                f"base_url must contain '-Reviews-' exactly once: {self.base_url!r}"
            )
        prefix, suffix = parts

        page_size = self.reviews_page_information.page_size
        page_count = self.reviews_page_information.page_count
        offsets = range(page_size, page_size * page_count, page_size)

        # Rebuild instead of append so a second call cannot duplicate URLs.
        self.review_urls = [self.base_url] + [
            f'{prefix}-Reviews-or{offset}-{suffix}' for offset in offsets
        ]

    def request_reviews(self):
        """Download every URL in ``review_urls`` into ``raw_reviews``.

        The response body is stored as text; the status code is not checked.
        """
        self.raw_reviews = [
            requests.get(
                review_url,
                headers=HEADERS,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            ).text
            for review_url in self.review_urls
        ]

    def soup_reviews(self):
        """Parse every downloaded page in ``raw_reviews`` into ``review_soups``."""
        self.review_soups = [
            BeautifulSoup(raw_page, features="html.parser")
            for raw_page in self.raw_reviews
        ]

    def parse_review_card(self, soups: [Tag]) -> [TripadvisorReview]:
        """Parse each review-card tag in ``soups`` and return the reviews in order."""
        return [self.parse_review(card) for card in soups]

    def _extract_posting_date(self, text: str) -> str:
        """Return the part of ``text`` before the separator, without padding."""
        date_part = text.partition(self.POSTING_DATE_SEPARATOR)[0]
        return date_part.strip(self._POSTING_DATE_PADDING)

    def parse_review(self, soup: Tag) -> TripadvisorReview:
        """Build a ``TripadvisorReview`` from a single review-card tag.

        ``posting_date`` and ``translated_by`` fall back to ``""`` when their
        tag is missing. All other fields are required: a missing tag raises
        ``AttributeError`` and unexpected content raises ``ValueError``.
        """
        # Exactly two "yCeTE" spans are expected (title, then text);
        # any other count makes the unpacking raise ValueError.
        review_title, review_text = soup.find_all("span", class_="yCeTE")
        review = TripadvisorReview()
        review.review_title = review_title.text
        review.review_text = review_text.text

        posting_date_tag = soup.find("div", class_="RpeCd")
        if posting_date_tag is not None:
            review.posting_date = self._extract_posting_date(posting_date_tag.text)
        else:
            review.posting_date = ""

        review.username = soup.find("a", class_="ukgoS").text

        # The rating is the first space-separated token of the aria-label;
        # a decimal comma is converted to a point before float().
        stars_label = soup.find("svg", class_="H0").get("aria-label")
        review.count_stars = float(stars_label.split(" ")[0].replace(',', '.'))

        review.count_likes = int(soup.find("span", class_="biGQs _P FwFXZ").text)

        translation_hint = soup.find("span", class_="Ne d Vm")
        if translation_hint is not None:
            review.translated_by = translation_hint.img.get("alt")
        else:
            review.translated_by = ""
        return review
class TripadvisorAttractionParser:
    """Fetch one Tripadvisor attraction page and parse it.

    The constructor downloads and parses the page; ``parse`` then collects
    the title, the review count and all reviews into a
    ``TripadvisorAttraction``.
    """

    # Seconds to wait on the HTTP request. Without a timeout ``requests``
    # can block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT_SECONDS = 30

    url: str
    soup: BeautifulSoup

    def __init__(self, url):
        """Download ``url`` and parse the response body with BeautifulSoup.

        A non-200 status is only reported on stdout; the body is parsed
        regardless.
        """
        self.url = url
        response = requests.get(
            url,
            headers=HEADERS,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        if response.status_code != 200:
            print("No status code 200")
        self.soup = BeautifulSoup(response.text, features="html.parser")

    def parse(self) -> TripadvisorAttraction:
        """Return a ``TripadvisorAttraction`` filled from the downloaded page."""
        attraction = TripadvisorAttraction()
        attraction.title = self.parse_title()
        attraction.count_of_reviews = self.parse_count_of_reviews()
        attraction.reviews = self.parse_reviews()
        return attraction

    def parse_title(self) -> str:
        """Return the text of the ``h1`` tagged ``data-automation="mainH1"``."""
        return self.soup.find("h1", attrs={"data-automation": "mainH1"}).text

    def parse_count_of_reviews(self) -> int:
        """Return the review count shown in the "KAVFZ" span.

        The number is the text before the first no-break space, with
        ``,`` and ``.`` grouping characters removed.
        """
        count_text = self.soup.find("span", class_="KAVFZ").text
        digits = count_text.split("\xa0", maxsplit=2)[0]
        return int(digits.replace(',', '').replace('.', ''))

    def parse_reviews(self) -> [TripadvisorReview]:
        """Work out the pagination and parse the reviews of every page.

        The page size is derived from the first and last review index shown
        in the pagination text. The total count comes from the parenthesised
        number at the end of a button's ``aria-label``.
        """
        pagination_info = self.soup.find("div", class_="Ci").text
        # Split once and reuse the tokens for both indices.
        # NOTE(review): the inner ``.split(" ")[0]`` is a no-op if both
        # separators are plain spaces; the original may have used a different
        # (invisible) whitespace character there -- confirm.
        pagination_tokens = pagination_info.split(" ")
        first_review_index = int(pagination_tokens[1].split(" ")[0])
        last_review_index = int(pagination_tokens[3].split(" ")[0])

        count_button = self.soup.find(
            "button", class_="OKHdJ z Pc PQ Pp PD W _S Gn Z B2 BF _M PQFNM wSSLS")
        count_label = count_button.get("aria-label")
        # Take the text inside the last "(...)" and drop "." grouping characters.
        total_review_count = int(
            count_label.split('(')[-1].split(')')[0].replace('.', ''))

        page_size = last_review_index - first_review_index + 1
        reviews_page_information = ReviewsPageInformation(
            page_size,
            page_count=math.ceil(total_review_count / page_size))
        return TripadvisorAttractionReviewParser(reviews_page_information, self.url).parse_reviews()