import requests
import random

urls = ["http://ident.me"]  # ... more URLs
proxy_list = [
    "54.37.160.88:1080",
    "18.222.22.12:3128",
    # ... more proxy IPs
]

for url in urls:
    proxy = random.choice(proxy_list)
    proxies = {"http": f"http://{proxy}", "https": f"http://{proxy}"}
    response = requests.get(url, proxies=proxies)
    print(response.text)
    # prints 54.37.160.88 (or any other proxy IP)
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
"curl/7.74.0"
Would you trust a request arriving with the second UA? You'd be skeptical at the very least. The fix is the same as with proxies: keep a list of realistic User-Agents and pick one at random for every request.
# ... same as above
user_agents = [
    "Mozilla/5.0 (iPhone ...",
    "Mozilla/5.0 (Windows ...",
    # ... more User-Agents
]

for url in urls:
    proxy = random.choice(proxy_list)
    proxies = {"http": f"http://{proxy}", "https": f"http://{proxy}"}
    user_agent = random.choice(user_agents)
    headers = {"User-Agent": user_agent}
    response = requests.get(url, proxies=proxies, headers=headers)
    print(response.text)
Some sites mark up their content with itemprop or data attributes. Others use hidden inputs for internal purposes (e.g., IDs, categories, product code), and you can take advantage of that. There's more than meets the eye.
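To make that concrete, here is a minimal sketch using BeautifulSoup; the product markup and field names are made up for illustration, not taken from any particular site.

from bs4 import BeautifulSoup

html = """
<div itemscope itemtype="https://schema.org/Product">
    <span itemprop="name">ACME Widget</span>
    <span itemprop="price">19.99</span>
    <input type="hidden" name="product_id" value="12345">
</div>
"""

soup = BeautifulSoup(html, "html.parser")
# Semantic markup: collect every element that declares an itemprop.
item = {tag["itemprop"]: tag.get_text(strip=True) for tag in soup.select("[itemprop]")}
# Hidden inputs often carry internal IDs, categories, or product codes.
hidden = {tag["name"]: tag.get("value") for tag in soup.select("input[type=hidden]")}
print(item)    # {'name': 'ACME Widget', 'price': '19.99'}
print(hidden)  # {'product_id': '12345'}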
# ...
def extract_content(url, soup):
    # ...

def store_content(url, content):
    # ...

def allow_url_filter(url):
    # ...

def get_html(url):
    return headless_chromium.get_html(url, headers=random_headers(), proxies=random_proxies())
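For context, this is roughly how those helpers could plug into each other; the crawl function and the BeautifulSoup parsing step are my own glue, not part of the original snippet.

from bs4 import BeautifulSoup

def crawl(url):
    if not allow_url_filter(url):
        return
    html = get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    content = extract_content(url, soup)
    store_content(url, content)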
robots.txt can also include a Crawl-Delay directive. It is not common, but when present, it represents the number of seconds crawlers should wait between requests. There is a Python module that can help us comply with robots.txt.
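One option (assuming the standard library is acceptable; the module the original text had in mind may be a different one) is urllib.robotparser, which answers both "can I fetch this URL?" and "is there a Crawl-Delay?".

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://example.com/robots.txt")  # example domain, swap in the real target
rp.read()

user_agent = "MyCrawler"
print(rp.can_fetch(user_agent, "https://example.com/some/page"))  # True/False per robots.txt
print(rp.crawl_delay(user_agent))  # seconds to wait, or None if no Crawl-Delay is set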
Did you notice the "Sec-Ch-Ua" header? 😕 A real iPhone Safari, for example, never sends "Sec-Ch-Ua", but Puppeteer will since you overwrote UA but didn't delete that one. The safest approach is to rotate full sets of headers copied from real browsers, so every header stays consistent with the UA.
# ...
header_sets = [
    {
        "Accept-Encoding": "gzip, deflate, br",
        "Cache-Control": "no-cache",
        "User-Agent": "Mozilla/5.0 (iPhone ...",
        # ...
    }, {
        "User-Agent": "Mozilla/5.0 (Windows ...",
        # ...
    },
    # ... more header sets
]

for url in urls:
    # ...
    headers = random.choice(header_sets)
    response = requests.get(url, proxies=proxies, headers=headers)
    print(response.text)