Install the required libraries with pip install:
pip install requests beautifulsoup4 pandas
import requests
from bs4 import BeautifulSoup
response = requests.get("https://zenrows.com")
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.title.string) # Web Data Automation Made Easy - ZenRows
with open("test.html") as fp:
    soup = BeautifulSoup(fp, "html.parser")
print(soup.title.string) # Web Data Automation Made Easy - ZenRows
interactionCount = soup.find('meta', itemprop="interactionCount")
print(interactionCount['content']) # 8566042
datePublished = soup.find('meta', itemprop="datePublished")
print(datePublished['content']) # 2014-01-09
internalLinks = [
    a.get('href') for a in soup.find_all('a')
    if a.get('href') and a.get('href').startswith('/')]
print(internalLinks)
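These links are relative to the site root. If absolute URLs are needed later, urllib's urljoin can build them; a minimal sketch (not part of the original snippet), assuming https://zenrows.com as the base URL:
from urllib.parse import urljoin

base_url = "https://zenrows.com"  # assumed base; use the URL you actually scraped
absolute_links = [urljoin(base_url, link) for link in internalLinks]
print(absolute_links)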
links = [a.get('href') for a in soup.find_all('a')]
to_extract = ["facebook.com", "twitter.com", "mailto:"]
social_links = []
for link in links:
    for social in to_extract:
        if link and social in link:
            social_links.append(link)
print(social_links)
# ['mailto:****@webscraper.io',
# 'https://www.facebook.com/webscraperio/',
# 'https://twitter.com/webscraperio']
Another common task is extracting email addresses. A simple regular expression does the job; note that the pattern below only accepts top-level domains of two to four characters, so multi-part suffixes such as .co.uk are not fully matched.
import re

emails = re.findall(
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}",
    str(soup))
print(emails) # ['****@webscraper.io', '****@webscraper.io']
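If longer or multi-part TLDs matter, the upper bound can simply be dropped; a sketch (not from the original), otherwise identical to the pattern above:
import re

# Allow TLDs of any length, which also matches multi-part suffixes like .co.uk in full
emails = re.findall(
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
    str(soup))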
table = soup.find("table", class_="sortable")
output = []
for row in table.findAll("tr"):
    new_row = []
    for cell in row.findAll(["td", "th"]):
        for sup in cell.findAll('sup'):
            sup.extract()
        for collapsible in cell.findAll(class_="mw-collapsible-content"):
            collapsible.extract()
        new_row.append(cell.get_text().strip())
    output.append(new_row)
print(output)
# [
# ['Artist', 'Album', 'Released', ...],
# ['Michael Jackson', 'Thriller', '1982', ...]
# ]
Instead of parsing the table manually, we can use pandas and import the HTML directly, as shown below. It will handle everything for us: the first row becomes the headers, and the rest is loaded as content with the right types. read_html returns a list of DataFrames, so we take the first item and then drop a column that has no content.
import pandas as pd
table_df = pd.read_html(str(table))[0]
table_df = table_df.drop(columns='Ref(s)')
print(table_df.columns) # ['Artist', 'Album', 'Released' ...
print(table_df.dtypes) # ... Released int64 ...
print(table_df['Claimed sales*'].sum()) # 422
print(table_df.loc[3])
# Artist Pink Floyd
# Album The Dark Side of the Moon
# Released 1973
# Genre Progressive rock
# Total certified copies... 24.4
# Claimed sales* 45
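From here the DataFrame behaves like any other, so it can be filtered or exported; as a possible follow-up (not in the original snippet), writing it to CSV:
table_df.to_csv("best_selling_albums.csv", index=False)  # hypothetical output file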
actors = soup.find(class_="item-starring").find(
    class_="title-data-info-item-list")
print(actors.text.split(','))
# ['Henry Cavill', 'Anya Chalotra', 'Freya Allan']
import json
ldJson = soup.find("script", type="application/ld+json")
parsedJson = json.loads(ldJson.contents[0])
print([actor['name'] for actor in parsedJson['actors']])
# [... 'Jodhi May', 'MyAnna Buring', 'Joey Batey' ...]
"header ul > li"
, right? It will work. But we need Javascript rendering for that since it is not present on the first load. As stated before, we should try to avoid that.metaDescription = soup.find("meta", {'name': 'description'})
print(metaDescription['content'])
# 87.9m Followers, 0 Following, 493 Posts ...
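If only the follower count is needed, a small regular expression can pull it out of that description; a minimal sketch, assuming the "<number> Followers" format shown above:
import re

match = re.search(r"([\d.,]+[kKmM]?) Followers", metaDescription['content'])
if match:
    print(match.group(1))  # 87.9m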
itemprop="availability"
. As for the brand, the same snippet as the one used for YouTube but changing the property name to "brand."brand = soup.find('meta', itemprop="brand")
print(brand['content']) # Tesla
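And the availability sketch mentioned above, assuming the page exposes it through an element with itemprop="availability" (the exact tag and the attribute holding the value may differ per store):
# The value is sometimes in a content attribute (meta) and sometimes in href (link)
availability = soup.find(itemprop="availability")
if availability is not None:
    print(availability.get('content') or availability.get('href'))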
import json
ldJson = soup.find("script", type="application/ld+json")
parsedJson = json.loads(ldJson.contents[0])
print(parsedJson["aggregateRating"]["ratingValue"]) # 4.9
print(parsedJson["aggregateRating"]["reviewCount"]) # 57
print(parsedJson["weight"]) # 0.492kg -> extra, not visible in UI
products = []
cards = soup.find_all(class_="card")
for card in cards:
    products.append({
        'id': card.get('data-entity-id'),
        'name': card.get('data-name'),
        'category': card.get('data-product-category'),
        'price': card.get('data-product-price')
    })
print(products)
# [
# {
# "category": "Wood Bats, Wood Bats/Professional Cuts",
# "id": "1945",
# "name": "6 Bat USA Professional Cut Bundle",
# "price": "579.99"
# },
# {
# "category": "Wood Bats, Wood Bats/Pro Model",
# "id": "1804",
# "name": "M-71 Pro Model",
# "price": "159.99"
# },
# ...
# ]
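As a possible next step (not shown in the original), the list of dictionaries maps cleanly onto a pandas DataFrame, which makes aggregation and export straightforward:
import pandas as pd

products_df = pd.DataFrame(products)
# Prices arrive as strings, so convert them before doing any math
products_df['price'] = pd.to_numeric(products_df['price'], errors='coerce')
products_df.to_csv("products.csv", index=False)  # hypothetical output file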