27
loading...
This website collects cookies to deliver better user experience
pip install selenium
from selenium import webdriver
# Open the site in Chrome and report where the browser ended up.
url = "http://zenrows.com"
# The `with` block takes care of closing the browser when it exits.
with webdriver.Chrome() as driver:
    driver.get(url)
    print(driver.current_url)  # https://www.zenrows.com/
    print(driver.title)  # Web Scraping API & Data Extraction - ZenRows
/usr/bin/
).
chrome_driver_path = '/path/to/chromedriver'
# Current Selenium 4 releases no longer accept `executable_path` on the driver
# constructor; the chromedriver location is passed through a Service object.
from selenium.webdriver.chrome.service import Service

with webdriver.Chrome(service=Service(executable_path=chrome_driver_path)) as driver:
    ...  # the rest of the script goes here
options = webdriver.ChromeOptions()
# The `options.headless = True` setter was deprecated in Selenium 4.8 and
# removed in later 4.x releases, so headless mode is requested as a Chrome
# command-line argument instead.
options.add_argument("--headless=new")
with webdriver.Chrome(options=options) as driver:
    ...  # the rest of the script goes here
driver.find_element(By.TAG_NAME, "input")
. But this might be a problem since there are several inputs on the page. By inspecting the page, we see that it has an ID, so we change the selector: driver.find_element(By.ID, "twotabsearchtextbox")
.
from selenium import webdriver
from selenium.webdriver.common.by import By
url = "https://www.amazon.com/"
with webdriver.Chrome(options=options) as driver:
    driver.get(url)
    # Named `search_input` rather than `input` so the builtin is not shadowed.
    search_input = driver.find_element(
        By.CSS_SELECTOR, "form[role='search'] input[type='text']")
a-list-item
. We need a similar function (find_elements
in plural) to match all the items and not just the first occurrence.
#...
# Load the page, then collect every element carrying the "a-list-item" class
# (the plural `find_elements` matches all occurrences, not only the first).
driver.get(url)
items = driver.find_elements(By.CLASS_NAME, "a-list-item")
send_keys
function that will type and hit enter to send the form. We could also type into the input and then find the submit button and click on it (element.click()
). It is easier in this case since the Enter
works fine.
from selenium.webdriver.common.keys import Keys
#...
# Named `search_input` rather than `input` so the builtin is not shadowed.
search_input = driver.find_element(
    By.CSS_SELECTOR, "form[role='search'] input[type='text']")
# Appending Keys.ENTER types the query and then submits the form with Enter.
search_input.send_keys('Python Books' + Keys.ENTER)
find_elements
as above. Inspecting the result, we can use the s-result-item
class.
div
s with several inner tags. We could take the link's href
values if interested and visit each item - we won't do that for the moment. But the h2
tags contain the book's title, so we need to select the title for each element. We can continue using find_element
since it will work for driver
, as seen before, and for any web element.
# ...
# Collect every search result, then print the title held in each one's <h2>.
items = driver.find_elements(By.CLASS_NAME, "s-result-item")
for item in items:
    # `find_element` is called on the result element, so the search is
    # scoped to that element rather than to the whole page.
    h2 = item.find_element(By.TAG_NAME, "h2")
    print(h2.text)  # Prints a list of around fifty items
    # Learning Python, 5th Edition ...
body
and send the keys there.
driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
WebDriverWait
to put the script on hold until some criteria are met.
img
element as soon as it appears. The driver will wait for 3 seconds and fail otherwise.
from selenium.webdriver.support.ui import WebDriverWait
# ...
# Put the script on hold, for at most 3 seconds, until an <img> can be found.
wait = WebDriverWait(driver, timeout=3)
el = wait.until(lambda browser: browser.find_element(By.TAG_NAME, "img"))
element_to_be_clickable
is an excellent example in a page full of JavaScript, since many buttons are not interactive until some actions occur.
from selenium.webdriver.support import expected_conditions as EC
#...
# Wait up to 3 seconds for the element with class "my-button" to be clickable.
button_is_clickable = EC.element_to_be_clickable((By.CLASS_NAME, 'my-button'))
button = WebDriverWait(driver, 3).until(button_is_clickable)
# ...
# Save a screenshot taken through the driver into page.png.
driver.save_screenshot('page.png')
# ...
# Locate an element by its "a-cardui" class and screenshot only that element.
card = driver.find_element(By.CLASS_NAME, "a-cardui")
card.screenshot("amazon_card.png")
driver.get_window_size()
, which will print {'width': 800, 'height': 600}
. When using a GUI, those numbers will change, so let's assume that we're testing headless mode.
set_window_size
- that will modify the window size. Or we can add an options argument to the Chrome web driver that will directly start the browser with that resolution.
options.add_argument("--window-size=1024,768")
with webdriver.Chrome(options=options) as driver:
    # Size the browser started with (taken from the options it was given).
    print(driver.get_window_size())
    # {'width': 1024, 'height': 768}
    # Resize the running browser, then confirm the new dimensions.
    driver.set_window_size(1920, 1200)
    driver.get(url)
    print(driver.get_window_size())
    # {'width': 1920, 'height': 1200}
# Start Chrome with a hardcoded User-Agent and print the page body to check
# which User-Agent was actually used.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
options.add_argument(f'user-agent={user_agent}')
with webdriver.Chrome(options=options) as driver:
    driver.get(url)
    print(driver.find_element(By.TAG_NAME, "body").text)  # UA matches the one hardcoded above, v93
sec-ch-ua
header usually sends a version of the browser, and it must match the user-agent's one: "Google Chrome";v="96"
. But some older versions do not send that header at all, so sending it might also be suspicious.
pip install selenium-wire
.
from seleniumwire import webdriver
# Target URL and the header values the request interceptor will send.
url = "http://httpbin.org/anything"
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
sec_ch_ua = '"Google Chrome";v="93", " Not;A Brand";v="99", "Chromium";v="93"'
referer = 'https://www.google.com'
options = webdriver.ChromeOptions()
# The `options.headless = True` setter was deprecated in Selenium 4.8 and
# removed in later 4.x releases, so headless mode is requested as a Chrome
# command-line argument instead.
options.add_argument("--headless=new")
def interceptor(request):
    """Overwrite the identifying headers of an outgoing request.

    Replaces the ``user-agent`` header and sets ``sec-ch-ua`` and ``referer``
    from the module-level ``user_agent``, ``sec_ch_ua`` and ``referer`` values.

    Args:
        request: The intercepted request; its ``headers`` mapping is modified
            in place.
    """
    del request.headers['user-agent']  # Delete the header first
    request.headers['user-agent'] = user_agent
    request.headers['sec-ch-ua'] = sec_ch_ua
    request.headers['referer'] = referer
with webdriver.Chrome(options=options) as driver:
    # Register the interceptor before navigating so it applies to this request.
    driver.request_interceptor = interceptor
    driver.get(url)
    print(driver.find_element(By.TAG_NAME, "body").text)
from selenium import webdriver
# ...
# Route the browser through a proxy and print the IP the target site sees.
url = "http://httpbin.org/ip"
proxy = '85.159.48.170:40014'  # free proxy
options.add_argument(f'--proxy-server={proxy}')
with webdriver.Chrome(options=options) as driver:
    driver.get(url)
    print(driver.find_element(By.TAG_NAME, "body").text)  # "origin": "85.159.48.170"
# Proxy that needs credentials: configured through selenium-wire's own options.
proxy_pass = "YOUR_API_KEY"
seleniumwire_options = {
    'proxy': {
        "http": f"http://{proxy_pass}:@proxy.zenrows.com:8001",
    },
    # `verify_ssl` is a top-level selenium-wire option, not a key of the
    # `proxy` mapping, so it belongs next to `proxy` rather than inside it.
    'verify_ssl': False,
}
with webdriver.Chrome(options=options,
                      seleniumwire_options=seleniumwire_options) as driver:
    driver.get(url)
    print(driver.find_element(By.TAG_NAME, "body").text)
driver.proxy
can be overwritten. From that point on, all requests will use the new proxy. This action can be done as many times as necessary. For convenience and reliability, we advocate for Smart Rotating Proxies.
#...
driver.get(url)  # Initial proxy
# Overwriting `driver.proxy` makes the requests that follow use the new proxy.
driver.proxy = {
    # Placeholder credentials and host: substitute the real proxy details.
    'http': 'http://user:pass@proxy.example.com:5678',
}
driver.get(url)  # New proxy
from selenium import webdriver
url = "https://www.amazon.com/"
options = webdriver.ChromeOptions()
# The `options.headless = True` setter was deprecated in Selenium 4.8 and
# removed in later 4.x releases, so headless mode is requested as a Chrome
# command-line argument instead.
options.add_argument("--headless=new")
# Chrome preference that disables image loading for the session.
options.experimental_options["prefs"] = {
    "profile.managed_default_content_settings.images": 2
}
with webdriver.Chrome(options=options) as driver:
    driver.get(url)
    driver.save_screenshot('amazon_without_images.png')
# Chrome preferences that switch off several kinds of page content at once.
blocked_content_prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.stylesheets": 2,
    "profile.managed_default_content_settings.javascript": 2,
    "profile.managed_default_content_settings.cookies": 2,
    "profile.managed_default_content_settings.geolocation": 2,
    "profile.default_content_setting_values.notifications": 2,
}
options.add_experimental_option("prefs", blocked_content_prefs)
exclude_hosts
or allow only specific requests based on URLs matching against a regular expression with driver.scopes
def interceptor(request):
    """Abort requests whose path ends in ``.png`` or ``.gif``.

    Requests with any other path are left untouched.

    Args:
        request: The intercepted request; only its ``path`` is inspected and
            ``abort()`` is called on a match.
    """
    # Block PNG and GIF images, will show JPEG for example
    if request.path.endswith(('.png', '.gif')):
        request.abort()
with webdriver.Chrome(options=options) as driver:
    # Register the interceptor before navigating so it applies to this page load.
    driver.request_interceptor = interceptor
    driver.get(url)
execute_script
passing the JS code we want to be executed. It can go without params or with elements as params.
navigator
object since some security checks might raise red flags otherwise. The second one will take an h2
as an argument and return its left position by accessing getClientRects
with webdriver.Chrome(options=options) as driver:
    driver.get(url)
    # Script without parameters: read the User-Agent the browser reports.
    agent = driver.execute_script("return navigator.userAgent")
    print(agent)  # Mozilla/5.0 ... Chrome/96 ...
    header = driver.find_element(By.CSS_SELECTOR, "h2")
    # Script with an element parameter: the element arrives as arguments[0]
    # and the value returned is its left position, not its text.
    header_left = driver.execute_script(
        'return arguments[0].getClientRects()[0].left', header)
    print(header_left)  # 242.5