25
loading...
This website uses cookies to deliver a better user experience
beautifulsoup
, requests
, lxml
, re
libraries. An alternative API solution will be shown. Note: this blog post shows how to scrape a specific carousel layout that you'll see in the "What will be scraped" section.
$ pip install requests
$ pip install lxml
$ pip install beautifulsoup4
$ pip install google-search-results
CSS
selectors because of select()
/select_one()
beautifulsoup
methods which accept CSS
selectors. CSS
selectors reference.

from bs4 import BeautifulSoup
import requests, lxml, re, json
from serpapi import GoogleSearch # API solution
g-img.img
or simply rISBZc
CSS
class to grab src
attribute, you'll get a data:image
URL, but it will be a 1x1 placeholder, instead of 120x120 image. <script>
tags, so we need to grab them somehow. But first, how on Earth do I think that thumbnails are located in the <script>
tags?id
value. id
value to find it.<script>
tags. That's what we're looking for.<script>
tags we need to use regex
and grab needed data in capture group:# grabbing every script element
all_script_tags = soup.select('script')
# quick and dirty regex
# https://regex101.com/r/NYdrL5/1/
matched_thumbnails = re.findall(r"<script nonce=\".*?\">\(\w+\(\)\{\w+\s?\w+='(.*?)';\w+\s?\w+=\['\w+'\];\w+\(\w+,\w+\);\}\)\(\);<\/script>", str(all_script_tags))
data:image
URLs need to be decoded in a loop:for thumbnail in thumbnails:
decoded_thumbnail = bytes(thumbnail, 'ascii').decode('unicode-escape')
ct5Ked
CSS
selector using for
loop and call specific data:for result in soup.select('.ct5Ked'):
title = result["aria-label"] # call aria-label attribute
link = f"https://www.google.com{result['href']}" # call href attribute
try:
# sometimes it's empty because of no result in Google output
extensions = result.select_one(".cp7THd .FozYP").text
except: extensions = None
for
loops. To do that, one of the easiest functions I find is to use zip()
:for result, thumbnail in zip(soup.select('.ct5Ked'), thumbnails):
title = result["aria-label"]
link = f"https://www.google.com{result['href']}"
try:
extensions = result.select_one(".cp7THd .FozYP").text
except: extensions = None
decoded_thumbnail = bytes(thumbnail, 'ascii').decode('unicode-escape')
from bs4 import BeautifulSoup
import requests, lxml, re, json
# Desktop Chrome/Edge user agent so Google serves the full desktop markup
# instead of a reduced page for "unknown" clients.
headers = {
    'User-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
}

# Search query parameters: the query itself ('q') and the country to search from ('gl').
params = {'q': 'dune actors', 'gl': 'us'}
def get_top_carousel():
    """Scrape Google's top carousel and print it as JSON.

    For each carousel item, collects the title, link, extensions text and
    the decoded base64 thumbnail, grouped under the carousel heading.
    Uses the module-level ``headers`` and ``params`` for the request.
    """
    html = requests.get('https://www.google.com/search', headers=headers, params=params)
    soup = BeautifulSoup(html.text, 'lxml')

    # Carousel heading element, e.g. the "Dune cast" title above the carousel.
    carousel_name = soup.select_one('.F0gfrd+ .z4P7Tc').text

    # creating hash before iterating over title, link, extensions
    data = {f"{carousel_name}": []}

    # Thumbnails are NOT in <img src> (those are 1x1 data:image placeholders);
    # the real 120x120 data:image URLs live inside inline <script> tags, so we
    # grab every script element and pull the URLs out with a regex capture group.
    all_script_tags = soup.select('script')
    # quick and dirty regex: https://regex101.com/r/NYdrL5/1/
    thumbnails = re.findall(r"<script nonce=\"\w+\D{1,2}?\">\(\w+\(\)\{\w+\s?\w+='(.*?)';\w+\s?\w+=\['\w+'\];\w+\(\w+,\w+\);\}\)\(\);<\/script>", str(all_script_tags))

    for result, thumbnail in zip(soup.select('.ct5Ked'), thumbnails):
        title = result["aria-label"]  # call aria-label attribute
        link = f"https://www.google.com{result['href']}"  # call href attribute
        try:
            extensions = result.select_one(".cp7THd .FozYP").text
        except AttributeError:
            # select_one() returned None (no extension in Google's output);
            # catch only AttributeError instead of a bare except that would
            # also swallow KeyboardInterrupt/SystemExit and real bugs.
            extensions = None
        # Thumbnails arrive escaped (\xNN sequences); decode to a usable data URL.
        decoded_thumbnail = bytes(thumbnail, 'ascii').decode('unicode-escape')
        data[carousel_name].append({
            'title': title,
            'link': link,
            # key fixed from the 'extentions' typo to match the printed output
            'extensions': [extensions],
            'thumbnail': decoded_thumbnail
        })

    print(json.dumps(data, indent=2, ensure_ascii=False))
get_top_carousel()  # module-level entry point: runs the scrape when the script executes
--------------------
# part of the output
'''
}
]
{
"name": "Timothée Chalamet",
"link": "https://www.google.com/search?hl=en&gl=us&q=Timoth%C3%A9e+Chalamet&stick=H4sIAAAAAAAAAONgFuLVT9c3NEzLqko2ii8xUOLSz9U3KDDKM0wr0BLKTrbST8vMyQUTVsmJxSWPGJcycgu8_HFPWGo246Q1J68xTmHkwqJOyJCLzTWvJLOkUkhQip8L1RIjEahAtll2hpFZXqHAwmWzGJWcjUx2XZp2jk1P8FkoA0Ndb4iDkiLnFCHrhswn7-wFXd__299ywsBBgkWBQYPB8JElq8P6KYwHtBgOMDI17VtxiI2Fg1GAwYpJg6mKiYOFZxGrUEhmbn5JxuGVqQrOGYk5ibmpJRPYGAHILgFT8gAAAA&sa=X&ved=2ahUKEwiMxLi-ksXzAhUAl2oFHf88AN0Q-BZ6BAgBEDQ",
"extensions": [
"Paul Atreides"
],
"thumbnail": "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBwgHBgkIBwgKCgkLDRYPDQwMDRsUFRAWIB0iIiAdHx8kKDQsJCYxJx8fLT0tMTU3Ojo6Iys/RD84QzQ5OjcBCgoKDQwNGg8PGjclHyU3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3Nzc3N//AABEIAHgAeAMBIgACEQEDEQH/xAAbAAABBQEBAAAAAAAAAAAAAAAEAAIDBQYBB//Ra8hFKlUWo8h+PocKVKlU4w//Z" # the URL is much longer, I shorten it on purpose.
}
]
}
'''
from serpapi import GoogleSearch
import os, json
def get_top_carousel():
    """Fetch the 'dune actors' knowledge graph cast via SerpApi and print each entry as JSON."""
    # Query configuration; the SerpApi key is read from the environment.
    search_params = {
        "api_key": os.getenv("API_KEY"),
        "engine": "google",
        "q": "dune actors",
        "hl": "en"
    }
    response = GoogleSearch(search_params).get_dict()
    cast_members = response['knowledge_graph']['cast']
    for member in cast_members:
        print(json.dumps(member, indent=2))
get_top_carousel()  # module-level entry point: runs the API query when the script executes
-------------
'''
# part of the output
{
"name": "Timothée Chalamet",
"extensions": [
"Paul Atreides"
],
"link": "https://www.google.com/search?hl=en&gl=us&q=Timoth%C3%A9e+Chalamet&stick=H4sIAAAAAAAAAONgFuLVT9c3NEzLqko2ii8xUOLSz9U3KDDKM0wr0BLKTrbST8vMyQUTVsmJxSWPGJcycgu8_HFPWGo246Q1J68xTmHkwqJOyJCLzTWvJLOkUkhQip8L1RIjEahAtll2hpFZXqHAwmWzGJWcjUx2XZp2jk1P8FkoA0Ndb4iDkiLnFCHrhswn7-wFXd__299ywsBBgkWBQYPB8JElq8P6KYwHtBgOMDI17VtxiI2Fg1GAwYpJg6mKiYOFZxGrUEhmbn5JxuGVqQrOGYk5ibmpJRPYGAHILgFT8gAAAA&sa=X&ved=2ahUKEwiMxLi-ksXzAhUAl2oFHf88AN0Q-BZ6BAgBEDQ",
"image": "https://serpapi.com/searches/6165a3dcfa86759a4fa42ba4/images/94afec67f82aa614bb572a123ec09cf051cf10bde8e0bc8025daf21915c49798.jpeg"
}
...
'''