This blog post is suited for users with little web scraping experience. It uses the beautifulsoup, requests, and lxml libraries.

Note: this blog post shows how to extract the data shown in the "what will be scraped" section, and doesn't cover handling of different layouts (unless said otherwise).
pip install requests
pip install lxml
pip install beautifulsoup4
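All three can also be installed with a single command:

pip install requests lxml beautifulsoup4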
This tutorial relies heavily on CSS selectors, because you'll mostly see usage of the select()/select_one() beautifulsoup methods, which accept CSS selectors. With the SelectorGadget extension you can grab CSS selectors by clicking on the desired element in the browser. If CSS selectors are new to you, have a look at a CSS selectors reference, or train on a few examples via CSS Diner.

SelectorGadget doesn't always produce the perfect CSS selector(s) or other HTML elements. You can test CSS selector(s) in the SelectorGadget window, or via the Dev Tools Console tab using $$(".SELECTOR"), which is equivalent to document.querySelectorAll(".SELECTOR"), to see if the correct elements are being selected.

Import the libraries:

import requests, lxml
from bs4 import BeautifulSoup
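As a quick illustration of how select_one() accepts CSS selectors, here's a minimal sketch using the imports above (the HTML snippet is made up for demonstration):

soup = BeautifulSoup('<div class="box"><a class="title" href="/news/1">Hello</a></div>', "lxml")

print(soup.select_one(".box .title").text)     # Hello
print(soup.select_one(".box .title")["href"])  # /news/1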
First, we need to save the HTML locally, open it, and find the CSS selectors or HTML elements from which to extract data. The main point of this is to make sure your IP won't be banned or blocked for some time, which would delay the script development process.
import requests

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "query": "minecraft",
    "where": "news",
}

html = requests.get("https://search.naver.com/search.naver", params=params, headers=headers).text

with open(f"{params['query']}_naver_news.html", mode="w") as file:
    file.write(html)
Let's break down the code. Import the requests library:

import requests

Add a user-agent to the request headers:

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

Add search query parameters:

params = {
    "query": "minecraft",  # search query
    "where": "news",       # news results
}

Pass the user-agent and parameters to the request, and grab the .text from the response:

html = requests.get("https://search.naver.com/search.naver", params=params, headers=headers).text

Save the HTML locally:

with open(f"{params['query']}_naver_news.html", mode="w") as file:
    file.write(html)

# output file will be minecraft_naver_news.html
Find the CSS selector of the container that holds all the needed data, such as the title, link, etc.:

for news_result in soup.select(".list_news .bx"):
    # further code
Find the CSS selectors for the title, link, etc. that will be used in the extracting part:

for news_result in soup.select(".list_news .bx"):
    # hey, news_result, grab the TEXT from every element with the ".news_tit" selector
    title = news_result.select_one(".news_tit").text
    # hey, news_result, grab the href (link) from every element with the ".news_tit" selector
    link = news_result.select_one(".news_tit")["href"]
    # other elements..
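Note that select_one() returns None when a selector doesn't match anything, so calling .text on a missing element raises an AttributeError. If the layout varies, a defensive pattern is to check before extracting (a sketch, not part of the original code):

for news_result in soup.select(".list_news .bx"):
    title_element = news_result.select_one(".news_tit")
    if title_element:  # skip results where the selector didn't match
        title = title_element.text
        link = title_element["href"]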
import lxml, json
from bs4 import BeautifulSoup

with open("minecraft_naver_news.html", mode="r") as html_file:
    html = html_file.read()

soup = BeautifulSoup(html, "lxml")

news_data = []

for news_result in soup.select(".list_news .bx"):
    title = news_result.select_one(".news_tit").text
    link = news_result.select_one(".news_tit")["href"]
    thumbnail = news_result.select_one(".dsc_thumb img")["src"]
    snippet = news_result.select_one(".news_dsc").text
    press_name = news_result.select_one(".info.press").text
    news_date = news_result.select_one("span.info").text

    news_data.append({
        "title": title,
        "link": link,
        "thumbnail": thumbnail,
        "snippet": snippet,
        "press_name": press_name,
        "news_date": news_date
    })

print(json.dumps(news_data, indent=2, ensure_ascii=False))
Import the libraries:

import lxml, json
from bs4 import BeautifulSoup

Change the file mode from writing (mode="w") to reading (mode="r"), pass the HTML to BeautifulSoup() so it can extract elements, and assign "lxml" as the HTML parser:

with open("minecraft_naver_news.html", mode="r") as html_file:
    html = html_file.read()  # reading

soup = BeautifulSoup(html, "lxml")
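A side note on the parser choice: "lxml" is used here because it's fast, but if it isn't installed, Python's built-in "html.parser" works as a drop-in alternative:

soup = BeautifulSoup(html, "html.parser")  # no extra dependency needed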
Create a temporary list to store the data:

news_data = []

Iterate over the container CSS selector that wraps all the needed data such as title, link, etc. inside itself, and extract it:

# news_data = []
for news_result in soup.select(".list_news .bx"):
    title = news_result.select_one(".news_tit").text
    link = news_result.select_one(".news_tit")["href"]
    thumbnail = news_result.select_one(".dsc_thumb img")["src"]
    snippet = news_result.select_one(".news_dsc").text
    press_name = news_result.select_one(".info.press").text
    news_date = news_result.select_one("span.info").text

    news_data.append({
        "title": title,
        "link": link,
        "thumbnail": thumbnail,
        "snippet": snippet,
        "press_name": press_name,
        "news_date": news_date
    })
Print the collected data using json.dumps(), which in this case is used just for pretty-printing:

print(json.dumps(news_data, indent=2, ensure_ascii=False))
# part of the output
'''
[
  {
    "title": "Xbox, 11월부터 블록버스터 게임 연이어 출시",
    "link": "http://www.gameshot.net/common/con_view.php?code=GA617793ce93c74",
    "thumbnail": "https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F5739%2F2021%2F10%2F26%2F19571.jpg&type=ofullfill264_180_gray&expire=2&refresh=true",
    "snippet": " 마인크래프트(Minecraft) – 11월 3일(한국 시간) 마인크래프트는 11월 3일 Xbox Game Pass PC용에 추가될 예정이며, 새로운 마인크래프트 던전스 시즈널 어드벤처(Minecraft Dungeons Seasonal Adventures), 동굴과... ",
    "press_name": "게임샷",
    "news_date": "6일 전"
  }
  # other results...
]
'''
To access the extracted data, iterate over the list:

for news in news_data:
    title = news["title"]
    # link, snippet, thumbnail..
    print(title)

# prints all titles that were appended to the list()
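If you'd rather save the results than print them, json.dump() writes the same structure to a file (a minimal sketch; the filename is arbitrary):

with open("minecraft_naver_news.json", mode="w") as json_file:
    json.dump(news_data, json_file, indent=2, ensure_ascii=False)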
import lxml, json, requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "query": "minecraft",
    "where": "news",
}

# function that parses content from a local copy of the html
def extract_news_from_html():
    with open("minecraft_naver_news.html", mode="r") as html_file:
        html = html_file.read()

    # calls naver_parser() function to parse the page
    data = naver_parser(html)

    print(json.dumps(data, indent=2, ensure_ascii=False))

# function that makes an actual request
def extract_naver_news_from_url():
    html = requests.get("https://search.naver.com/search.naver", params=params, headers=headers).text

    # calls naver_parser() function to parse the page
    data = naver_parser(html)

    print(json.dumps(data, indent=2, ensure_ascii=False))

# parser that accepts an html string from extract_news_from_html() or extract_naver_news_from_url()
def naver_parser(html):
    soup = BeautifulSoup(html, "lxml")

    news_data = []

    for news_result in soup.select(".list_news .bx"):
        title = news_result.select_one(".news_tit").text
        link = news_result.select_one(".news_tit")["href"]
        thumbnail = news_result.select_one(".dsc_thumb img")["src"]
        snippet = news_result.select_one(".news_dsc").text
        press_name = news_result.select_one(".info.press").text
        news_date = news_result.select_one("span.info").text

        news_data.append({
            "title": title,
            "link": link,
            "thumbnail": thumbnail,
            "snippet": snippet,
            "press_name": press_name,
            "news_date": news_date
        })

    return news_data
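To run it, a minimal usage sketch (assuming you've already saved the local copy from the earlier step):

if __name__ == "__main__":
    extract_news_from_html()          # parse the saved local copy while developing
    # extract_naver_news_from_url()   # or make a live request instead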
pip install google-search-results
from serpapi import GoogleSearch
import os, json

params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "naver",
    "query": "Minecraft",
    "where": "news"
}

search = GoogleSearch(params)  # where extraction happens
results = search.get_dict()    # where structured json appears

news_data = []

for news_result in results["news_results"]:
    title = news_result["title"]
    link = news_result["link"]
    thumbnail = news_result["thumbnail"]
    snippet = news_result["snippet"]
    press_name = news_result["news_info"]["press_name"]
    date_news_posted = news_result["news_info"]["news_date"]

    news_data.append({
        "title": title,
        "link": link,
        "thumbnail": thumbnail,
        "snippet": snippet,
        "press_name": press_name,
        "news_date": date_news_posted
    })

print(json.dumps(news_data, indent=2, ensure_ascii=False))
Import the libraries:

from serpapi import GoogleSearch
import os
import json  # in this case used for pretty printing

The os library stands for operating system (miscellaneous operating system interfaces), and os.getenv(SECRET_KEY) returns the value of the environment variable key if it exists.

Define the search parameters. Note: these parameters will be different depending on what "engine" you're using (except, in this case, "api_key" and "query"):
params = {
    "api_key": os.getenv("API_KEY"),  # API key stored in the environment variable
    "engine": "naver",                # search engine
    "query": "Minecraft",             # search query
    "where": "news"                   # news results filter
    # other parameters
}
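As a quick illustration: os.getenv() returns None when the variable is missing, so you can fail early instead of sending an unauthenticated request (a sketch, not part of the original code; it assumes the key was exported beforehand, e.g. export API_KEY=... in the shell):

api_key = os.getenv("API_KEY")
if api_key is None:
    raise SystemExit("API_KEY environment variable is not set")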
Create a temporary list to store the data:

news_data = []

Instead of using CSS selectors, we're extracting data from the dictionary (provided by SerpApi) by its keys:

for news_result in results["news_results"]:
    title = news_result["title"]
    link = news_result["link"]
    thumbnail = news_result["thumbnail"]
    snippet = news_result["snippet"]
    press_name = news_result["news_info"]["press_name"]
    date_news_posted = news_result["news_info"]["news_date"]

    news_data.append({
        "title": title,
        "link": link,
        "thumbnail": thumbnail,
        "snippet": snippet,
        "press_name": press_name,
        "news_date": date_news_posted
    })

print(json.dumps(news_data, indent=2, ensure_ascii=False))
# part of the output
'''
[
  {
    "title": "Xbox, 11월부터 블록버스터 게임 연이어 출시",
    "link": "http://www.gameshot.net/common/con_view.php?code=GA617793ce93c74",
    "thumbnail": "https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F5739%2F2021%2F10%2F26%2F19571.jpg&type=ofullfill264_180_gray&expire=2&refresh=true",
    "snippet": " 마인크래프트(Minecraft) – 11월 3일(한국 시간) 마인크래프트는 11월 3일 Xbox Game Pass PC용에 추가될 예정이며, 새로운 마인크래프트 던전스 시즈널 어드벤처(Minecraft Dungeons Seasonal Adventures), 동굴과... ",
    "press_name": "게임샷",
    "news_date": "6일 전"
  }
  # other results...
]
'''
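One caveat not covered in the code above: some news results may lack a field such as "thumbnail", in which case indexing with news_result["thumbnail"] raises a KeyError. A defensive alternative is dict.get():

thumbnail = news_result.get("thumbnail")  # returns None instead of raising KeyError when the field is absent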