This post uses the beautifulsoup4, requests, and lxml libraries. Note: this blog post shows how to extract the data that is shown in the what will be scraped section.
pip install requests
pip install lxml
pip install beautifulsoup4
A basic understanding of CSS selectors will help, because you'll mostly see usage of the select()/select_one() beautifulsoup methods, which accept CSS selectors.
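If these two methods are new to you, here's a tiny self-contained sketch; the HTML snippet is made up purely for illustration:
from bs4 import BeautifulSoup

# made-up HTML, just to show the difference between the two methods
html = """
<div class="total_wrap">
  <a class="link_tit" href="https://example.com">Example title</a>
  <p class="dsc_txt">Example snippet</p>
</div>
"""

soup = BeautifulSoup(html, "lxml")

print(soup.select(".total_wrap"))            # select() returns a list of ALL matches
print(soup.select_one(".link_tit")["href"])  # select_one() returns the FIRST match (or None)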
There's a separate blog post about CSS selectors that covers what they are, their pros and cons, and why they matter from a web-scraping perspective.
import requests, lxml
from bs4 import BeautifulSoup
Then, grab CSS selectors for all the needed data. After that, save the HTML locally (via requests.get("URL").text) so you don't have to send a request every time you tweak the parser; otherwise, the requests might get blocked (temporarily) or banned permanently.
import requests
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "query": "bruce lee",
    "where": "web"  # there's also a "nexearch" param that will produce different results
}
def save_naver_organic_results():
    html = requests.get("https://search.naver.com/search.naver", params=params, headers=headers).text

    # replace every space with an underscore (_) so "bruce lee" becomes "bruce_lee"
    query = params['query'].replace(" ", "_")

    with open(f"{query}_naver_organic_results.html", mode="w") as file:
        file.write(html)
import requests
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# query parameters
params = {
    "query": "bruce lee",
    "where": "web"
}
Pass URL parameters via requests.get(params=params) instead of leaving them in the URL. I find it more readable; for example, let's look at the exact same URL:
params = {
    "where": "web",
    "sm": "top_hty",
    "fbm": "1",
    "ie": "utf8",
    "query": "bruce+lee"
}
requests.get("https://search.naver.com/search.naver", params=params)
# VS
requests.get("https://search.naver.com/search.naver?where=web&sm=top_hty&fbm=1&ie=utf8&query=bruce+lee") # Press F.
Pass a user-agent: it's needed to act as a "real" user visit, otherwise the request might be denied. You can read more about it in my other blog post about how to reduce the chance of being blocked while web scraping search engines.
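To see why this matters, compare the default header requests sends when you don't pass anything. A minimal sketch, using httpbin.org only as a neutral echo endpoint to inspect the outgoing request:
import requests

response = requests.get("https://httpbin.org/headers")

# without a custom user-agent, requests identifies itself as python-requests,
# which is trivially easy for a website to detect and block
print(response.request.headers["User-Agent"])  # e.g. python-requests/2.28.1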
Find a container (a CSS selector that wraps all needed data), then grab the title, link, displayed link, and a snippet.
for result in soup.select(".total_wrap"):
    title = result.select_one(".total_tit").text.strip()
    link = result.select_one(".total_tit .link_tit")["href"]
    displayed_link = result.select_one(".total_source").text.strip()
    snippet = result.select_one(".dsc_txt").text
import lxml, json
from bs4 import BeautifulSoup
def extract_local_html_naver_organic_results():
    with open("bruce_lee_naver_organic_results.html", mode="r") as html_file:
        html = html_file.read()

    soup = BeautifulSoup(html, "lxml")

    data = []

    for index, result in enumerate(soup.select(".total_wrap")):
        title = result.select_one(".total_tit").text.strip()
        link = result.select_one(".total_tit .link_tit")["href"]
        displayed_link = result.select_one(".total_source").text.strip()
        snippet = result.select_one(".dsc_txt").text

        data.append({
            "position": index + 1,  # starts from 1, not from 0
            "title": title,
            "link": link,
            "displayed_link": displayed_link,
            "snippet": snippet
        })

    print(json.dumps(data, indent=2, ensure_ascii=False))
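One caveat worth flagging: select_one() returns None when a selector doesn't match, so chaining .text or ["href"] will raise an error if Naver changes its layout. A defensive variant of the same loop might look like this (a sketch only, assuming the soup object from above):
# defensive sketch: skip result blocks that are missing an expected element
for index, result in enumerate(soup.select(".total_wrap")):
    title_element = result.select_one(".total_tit")
    if title_element is None:
        continue  # layout changed or it's a different kind of result block
    title = title_element.text.strip()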
import lxml, json
from bs4 import BeautifulSoup
with open("bruce_lee_naver_organic_results.html", mode="r") as html_file:
    html = html_file.read()
soup = BeautifulSoup(html, "lxml")
data = []
The enumerate() function adds a counter to an iterable and returns it. More examples:
grocery = ["bread", "milk", "butter"]  # iterable

for index, item in enumerate(grocery):
    print(f"{index} {item}\n")
'''
0 bread
1 milk
2 butter
'''
# in our case the iterable is soup.select(), since it also returns an iterable
for index, result in enumerate(soup.select(".total_wrap")):
    title = result.select_one(".total_tit").text.strip()
    link = result.select_one(".total_tit .link_tit")["href"]
    displayed_link = result.select_one(".total_source").text.strip()
    snippet = result.select_one(".dsc_txt").text

    data.append({
        "position": index + 1,  # starts from 1, not from 0
        "title": title,
        "link": link,
        "displayed_link": displayed_link,
        "snippet": snippet
    })
Note: the first and second functions could be skipped if you don't really want to save the HTML locally, but keep in mind the possible consequences mentioned above.
import requests
import lxml, json
from bs4 import BeautifulSoup
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "query": "bruce lee",  # search query
    "where": "web"         # "nexearch" will produce different results
}
# function that saves HTML locally
def save_naver_organic_results():
    html = requests.get("https://search.naver.com/search.naver", params=params, headers=headers).text

    # replace every space so "bruce lee" becomes "bruce_lee"
    query = params['query'].replace(" ", "_")

    with open(f"{query}_naver_organic_results.html", mode="w") as file:
        file.write(html)
# function that opens the local HTML and calls a parser function
def extract_naver_organic_results_from_html():
    with open("bruce_lee_naver_organic_results.html", mode="r") as html_file:
        html = html_file.read()

    # calls naver_organic_results_parser() function to parse the page
    data = naver_organic_results_parser(html)
    print(json.dumps(data, indent=2, ensure_ascii=False))
# function that makes an actual request and calls a parser function
def extract_naver_organic_results_from_url():
    html = requests.get("https://search.naver.com/search.naver", params=params, headers=headers).text

    # calls naver_organic_results_parser() function to parse the page
    data = naver_organic_results_parser(html)
    print(json.dumps(data, indent=2, ensure_ascii=False))
# parser function that's called by both of the functions above
def naver_organic_results_parser(html):
    soup = BeautifulSoup(html, "lxml")

    data = []

    for index, result in enumerate(soup.select(".total_wrap")):
        title = result.select_one(".total_tit").text.strip()
        link = result.select_one(".total_tit .link_tit")["href"]
        displayed_link = result.select_one(".total_source").text.strip()
        snippet = result.select_one(".dsc_txt").text

        data.append({
            "position": index + 1,  # starts from 1, not from 0
            "title": title,
            "link": link,
            "displayed_link": displayed_link,
            "snippet": snippet
        })

    return data
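To actually run it, you could call the functions in order, for example under a __main__ guard; the call order below is just one sensible option, not part of the original snippet:
if __name__ == "__main__":
    save_naver_organic_results()               # download the page once and save it locally
    extract_naver_organic_results_from_html()  # parse the saved copy
    # or skip the local file entirely:
    # extract_naver_organic_results_from_url()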
The main difference is that there's no need to figure out CSS selectors and get pissed off when certain selectors don't work as you expected, plus there's no need to maintain the parser over time if something in the HTML changes and the script blows up with an error on the next run.
pip install google-search-results
from serpapi import GoogleSearch
import os, json
def serpapi_get_naver_organic_results():
    params = {
        "api_key": os.getenv("API_KEY"),
        "engine": "naver",     # search engine (Google, Bing, DuckDuckGo..)
        "query": "Bruce Lee",  # search query
        "where": "web"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    data = []

    for result in results["organic_results"]:
        data.append({
            "position": result["position"],
            "title": result["title"],
            "link": result["link"],
            "displayed_link": result["displayed_link"],
            "snippet": result["snippet"]
        })

    print(json.dumps(data, indent=2, ensure_ascii=False))
from serpapi import GoogleSearch
import os, json
params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "naver",     # search engine (Google, Bing, DuckDuckGo..)
    "query": "Bruce Lee",  # search query
    "where": "web"         # filter to extract data from organic results
}
search = GoogleSearch(params) # data extraction
results = search.get_dict()  # JSON output converted to a Python dictionary that we iterate over below
data = []
for result in results["organic_results"]:
data.append({
"position": result["position"],
"title": result["title"],
"link": result["link"],
"displayed_link": result["displayed_link"],
"snippet": result["snippet"]
})
print(json.dumps(data, indent=2, ensure_ascii=False))
# ----------------
# part of the output
'''
[
{
"position": 1,
"title": "Bruce Lee",
"link": "https://brucelee.com/",
"displayed_link": "brucelee.com",
"snippet": "New Podcast Episode: #402 Flowing with Dustin Nguyen Watch + Listen to Episode “Your inspiration continues to guide us toward our personal liberation.” - Bruce Lee - More Podcast Episodes HBO Announces Order For Season 3 of Warrior! WARRIOR Seasons 1 & 2 Streaming Now on HBO & HBO Max “Warrior is still the best show you’re"
}
# other results..
]
'''