To get started, install the required libraries with pip install:

pip install requests beautifulsoup4 pandas
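If you want to be sure the installation worked, a quick sanity check (not part of the original snippet) is to import the three packages and print their versions:

import requests
import bs4
import pandas

print(requests.__version__, bs4.__version__, pandas.__version__)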
We will request each page with requests.get and parse the HTML with BeautifulSoup. We will start by finding the links in a fake shop prepared for testing scraping, and we will store them in a set to avoid duplicates. Note that we hardcoded the selector for the links, meaning that it is not a universal solution; for the moment, we'll focus on the page at hand.

import requests
from bs4 import BeautifulSoup
to_visit = set()
response = requests.get('https://scrapeme.live/shop/page/1/')
soup = BeautifulSoup(response.content, 'html.parser')
for a in soup.select('a.page-numbers'):
    to_visit.add(a.get('href'))
print(to_visit)
# {'https://scrapeme.live/shop/page/2/', '.../3/', '.../46/', '.../48/', '.../4/', '.../47/'}
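The a.page-numbers selector only works for this particular shop. For comparison, a more generic (and noisier) sketch would collect every internal link instead, reusing the soup object from the snippet above and resolving relative URLs with urljoin:

from urllib.parse import urljoin, urlparse

base_url = 'https://scrapeme.live/shop/page/1/'
internal_links = set()
for a in soup.find_all('a', href=True):
    link = urljoin(base_url, a['href'])  # resolve relative links against the current page
    if urlparse(link).netloc == urlparse(base_url).netloc:  # keep only same-domain links
        internal_links.add(link)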
We will keep the pages already visited in another set and avoid duplicates by checking them before every request. In this case, to_visit is not being used, just maintained for demo purposes. To prevent visiting every page, we'll also add a max_visits variable. For now, we ignore the robots.txt file, but we have to be civil and nice (a minimal robots.txt check is sketched right after the next snippet).

visited = set()
to_visit = set()
max_visits = 3
def crawl(url):
    print('Crawl: ', url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    visited.add(url)
    for a in soup.select('a.page-numbers'):
        link = a.get('href')
        to_visit.add(link)
        if link not in visited and len(visited) < max_visits:
            crawl(link)
crawl('https://scrapeme.live/shop/page/1/')
print(visited) # {'.../3/', '.../1/', '.../2/'}
print(to_visit) # { ... new ones added, such as pages 5 and 6 ... }
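If you do want to honor robots.txt, the standard library already includes a parser; a minimal sketch for this site could look like this:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://scrapeme.live/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://scrapeme.live/shop/page/1/'))  # True if crawling this page is allowed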
Next, we split the code into small helper functions and wrap the request in a try block for security.

def get_html(url):
    try:
        return requests.get(url).content
    except Exception as e:
        print(e)
        return ''
def extract_links(soup):
    return [a.get('href') for a in soup.select('a.page-numbers')
            if a.get('href') not in visited]
def extract_content(soup):
    for product in soup.select('.product'):
        print(product.find('h2').text)
        # Bulbasaur, Ivysaur, ...
def crawl(url):
    if not url or url in visited:
        return
    print('Crawl: ', url)
    visited.add(url)
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    extract_content(soup)
    links = extract_links(soup)
    to_visit.update(links)
The crawl function acts as an orchestrator by calling them and applying the results. crawl is not recursive anymore; we'll handle that in a separate loop.

to_visit.add('https://scrapeme.live/shop/page/1/')
while (len(to_visit) > 0 and len(visited) < max_visits):
    crawl(to_visit.pop())
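A quick aside before going parallel: the try block in get_html swallows every error, which is fine for a demo. If you want the crawler to be stricter, a sketch (not part of the original code) with a timeout and an explicit error on bad status codes would be:

def get_html(url):
    try:
        response = requests.get(url, timeout=10)  # do not hang forever on a slow server
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        return response.content
    except Exception as e:
        print(e)
        return ''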
If we want to run several of these in parallel, different threads will end up sharing the same data structure (to_visit). Since the data structure is not protected, both could read and write it like this:

Thread 1 reads the set: (1, 2, 3) (simplified)
Thread 2 reads the same set: (1, 2, 3)
Thread 1 adds 4, 5 and writes back: (1, 2, 3, 4, 5)
Thread 2 adds 6, 7 and writes back: (1, 2, 3, 6, 7)

The second write overwrites the first, so items 4 and 5 are lost.
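One way to make that safe would be to guard the set with a lock every time a thread touches it; a minimal sketch (not the approach we will take) with a hypothetical add_links helper:

from threading import Lock

to_visit = set()
to_visit_lock = Lock()

def add_links(links):
    # every reader and writer must acquire the same lock
    with to_visit_lock:
        to_visit.update(links)

Python's queue.Queue already handles that synchronization internally, so we will use it instead: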
import queue
q = queue.Queue()
q.put('https://scrapeme.live/shop/page/1/')
def crawl(url):
    ...
    links = extract_links(soup)
    for link in links:
        if link not in visited:
            q.put(link)
We have replaced to_visit with a queue. But queues need handlers or workers to process their content. With the above, we have created a Queue and added an item (the original one). We also modified the crawl function to put links in the queue instead of updating the previous set.

from threading import Thread
def queue_worker(i, q):
    while True:
        url = q.get()  # Get an item from the queue, blocks until one is available
        print('to process:', url)
        q.task_done()  # Notifies the queue that the item has been processed
q = queue.Queue()
Thread(target=queue_worker, args=(0, q), daemon=True).start()
q.put('https://scrapeme.live/shop/page/1/')
q.join() # Blocks until all items in the queue are processed and marked as done
print('Done')
# to process: https://scrapeme.live/shop/page/1/
# Done
The worker will get an item, which will block until one is available. We process that item; for the moment, we just print it to show how it works. It will call crawl later. Once the item is processed, we notify the queue with task_done. That is what makes q.join() useful: as the docs put it, the join function "blocks until all items in the queue have been gotten and processed."

def queue_worker(i, q):
    while True:
        url = q.get()
        if (len(visited) < max_visits and url not in visited):
            crawl(url)
        q.task_done()
q = queue.Queue()
num_workers = 4
for i in range(num_workers):
    Thread(target=queue_worker, args=(i, q), daemon=True).start()
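To compare different worker counts yourself, you can time the whole crawl; a minimal sketch with time.perf_counter, assuming the queue and workers created above:

import time

start = time.perf_counter()
q.put('https://scrapeme.live/shop/page/1/')
q.join()  # wait until every queued URL has been processed
print(f'{len(visited)} pages in {time.perf_counter() - start:.2f}s')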
Be careful: increasing num_workers and max_visits would start lots of requests. If the script had some minor bug for any reason, you could perform hundreds of requests in a few seconds. For reference, these are the total crawl times for different numbers of workers:

1 worker (num_workers = 1): 29.41s
2 workers (num_workers = 2): 20.05s
5 workers (num_workers = 5): 11.97s
10 workers (num_workers = 10): 12.02s

Going from 5 to 10 workers barely helps, so a handful is enough here. To lower the chances of being blocked, we can also route the requests through a proxy and send the headers of a real browser:

proxies = {
    'http': 'http://190.64.18.177:80',
    'https': 'http://49.12.2.178:3128',
}
headers = {
    'authority': 'httpbin.org',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-US,en;q=0.9',
}
def get_html(url):
    try:
        response = requests.get(url, headers=headers, proxies=proxies)
        return response.content
    except Exception as e:
        print(e)
        return ''
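Free proxies like the ones above tend to be short-lived, so in practice you would rotate over a pool of your own. A sketch assuming a hypothetical proxy_pool list, picking one at random per request:

import random

proxy_pool = [  # hypothetical list, replace with your own proxies
    {'http': 'http://190.64.18.177:80', 'https': 'http://49.12.2.178:3128'},
    # ...
]

def get_html(url):
    try:
        proxy = random.choice(proxy_pool)  # a different proxy on every call
        response = requests.get(url, headers=headers, proxies=proxy)
        return response.content
    except Exception as e:
        print(e)
        return ''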
For the moment, we will store the extracted content in a data array, which is not a great idea. But it is enough for demo purposes.

data = []
def extract_content(soup):
    for product in soup.select('.product'):
        data.append({
            'id': product.find('a', attrs={'data-product_id': True})['data-product_id'],
            'name': product.find('h2').text,
            'price': product.find(class_='amount').text
        })
print(data)
# [{'id': '759', 'name': 'Bulbasaur', 'price': '£63.00'}, {'id': '729', 'name': 'Ivysaur', 'price': '£87.00'}, ...]
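We installed pandas at the beginning but have not used it yet; once the crawl finishes, a couple of lines (a sketch, the file name is arbitrary) are enough to export the collected items:

import pandas as pd

pd.DataFrame(data).to_csv('products.csv', index=False)  # dump the list of dicts to a CSV file

Putting all the pieces together, this is the final version of the code: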
import requests
from bs4 import BeautifulSoup
import queue
from threading import Thread
starting_url = 'https://scrapeme.live/shop/page/1/'
visited = set()
max_visits = 100 # careful, it will crawl all the pages
num_workers = 5
data = []
def get_html(url):
    try:
        response = requests.get(url)
        # response = requests.get(url, headers=headers, proxies=proxies)
        return response.content
    except Exception as e:
        print(e)
        return ''
def extract_links(soup):
    return [a.get('href') for a in soup.select('a.page-numbers')
            if a.get('href') not in visited]
def extract_content(soup):
    for product in soup.select('.product'):
        data.append({
            'id': product.find('a', attrs={'data-product_id': True})['data-product_id'],
            'name': product.find('h2').text,
            'price': product.find(class_='amount').text
        })
def crawl(url):
    visited.add(url)
    print('Crawl: ', url)
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    extract_content(soup)
    links = extract_links(soup)
    for link in links:
        if link not in visited:
            q.put(link)
def queue_worker(i, q):
    while True:
        url = q.get()  # Get an item from the queue, blocks until one is available
        if (len(visited) < max_visits and url not in visited):
            crawl(url)
        q.task_done()  # Notifies the queue that the item has been processed
q = queue.Queue()
for i in range(num_workers):
    Thread(target=queue_worker, args=(i, q), daemon=True).start()
q.put(starting_url)
q.join() # Blocks until all items in the queue are processed and marked as done
print('Done')
print('Visited:', visited)
print('Data:', data)
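One last note: the workers are daemon threads, so they simply die when the main thread exits after q.join(). If you prefer an explicit shutdown, a common pattern (sketched here, not part of the original script) is to send one sentinel per worker and let each worker break out of its loop:

def queue_worker(i, q):
    while True:
        url = q.get()
        if url is None:  # sentinel: this worker is done
            q.task_done()
            break
        if (len(visited) < max_visits and url not in visited):
            crawl(url)
        q.task_done()

# once the first q.join() returns, shut the workers down
for _ in range(num_workers):
    q.put(None)  # one sentinel per worker
q.join()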