20
loading...
This website collects cookies to deliver better user experience
selenium library. An alternative API solution will be shown.
selenium library and regular expressions.
from selenium import webdriver
import urllib.parse, re
CSS selectors. An id selector followed by a class selector. CSS selectors reference.
CSS selectors.
regex
which you can see in action on regex101 or in the online IDE.
re.findall(pattern, string) # returns a list
re.finditer(pattern, string) # returns an iterator
With the findall() method you can't specify .group(), including named groups, while with the finditer() method you can.
The urllib.parse.unquote() method, that's an easy one.
# encoded
>>> https%3A%2F%2Fimage.cnbcfm.com%2Fapi%2Fv1%2Fimage%2F106261274-1574442599483rtx7a0ls.jpg%3Fv%3D1574452686
# decoded
>>> https://image.cnbcfm.com/api/v1/image/106261274-1574442599483rtx7a0ls.jpg?v=1574452686
from selenium import webdriver
import urllib.parse, re
# Scrape the DuckDuckGo news carousel for "elon musk" and print, for every
# story that has an image, its title, link, source, date and thumbnail URL.
driver = webdriver.Chrome(executable_path='C:/Users/dimit/PycharmProjects/pythonProject/Scrape Search Engines/Walmart/chromedriver.exe')

# Compiled once, outside the loop. Captures the percent-encoded image URL that
# is embedded in the external-content.duckduckgo.com `background-image` style.
# https://regex101.com/r/98r2qW/1
thumbnail_pattern = re.compile(r'background-image: url\(\"\/\/external-content\.duckduckgo\.com\/iu\/\?u=(.*)&f=1&h=110\"\);')

try:
    driver.get('https://duckduckgo.com/?q=elon musk&kl=us-en&ia=web')

    for result in driver.find_elements_by_css_selector('#m1-0 .has-image'):
        # The title element carries both the headline text and the article link.
        title_element = result.find_element_by_css_selector('#m1-0 .js-carousel-item-title')
        title = title_element.text.strip()
        link = title_element.get_attribute('href')
        source = result.find_element_by_css_selector('#m1-0 .result__url').text
        date = result.find_element_by_css_selector('#m1-0 .tile__time').text
        thumbnail_encoded = result.find_element_by_css_selector('#m1-0 .module--carousel__image').get_attribute('style')

        # findall() returns the captured group(s); joining yields '' when the
        # style attribute does not match the expected pattern.
        match_thumbnail_urls = ''.join(thumbnail_pattern.findall(thumbnail_encoded))

        # Turn the percent-encoded URL (https%3A%2F%2F...) back into a plain URL.
        # https://www.kite.com/python/answers/how-to-decode-a-utf-8-url-in-python
        thumbnail = urllib.parse.unquote(match_thumbnail_urls)

        print(f'{title}\n{link}\n{source}\n{date}\n{thumbnail}\n')
finally:
    # Always shut the browser down, even if a lookup above raised, so no
    # browser/driver process is left running.
    driver.quit()
-------------------
'''
Elon Musk admits Tesla's Cybertruck could flop
https://www.cnbc.com/2021/07/15/elon-musk-admits-the-cybertruck-could-flop.html
CNBC
4h
https://image.cnbcfm.com/api/v1/image/106261274-1574442599483rtx7a0ls.jpg?v=1574452686
'''
JSON string.
from serpapi import GoogleSearch
import json # for pretty printing
# Query the "duckduckgo" engine through the serpapi client and pretty-print
# the `news_results` part of the response as JSON.
params = dict(
    api_key="YOUR_API_KEY",
    engine="duckduckgo",
    q="elon musk",
    kl="us-en",
)

search = GoogleSearch(params)
results = search.get_dict()

# ensure_ascii=False keeps non-ASCII characters readable in the printed JSON.
news_results = results['news_results']
print(json.dumps(news_results, ensure_ascii=False, indent=2))
------------------------
'''
[
{
"position": 1,
"title": "Elon Musk admits Tesla's Cybertruck could flop",
"link": "https://www.cnbc.com/2021/07/15/elon-musk-admits-the-cybertruck-could-flop.html",
"snippet": "Tesla CEO Elon Musk admitted Thursday on Twitter that the Cybertruck might flop but said he doesn't care because he loves its unusual trapezoid-like design.",
"source": "CNBC",
"date": "4 hours ago",
"thumbnail": "https://image.cnbcfm.com/api/v1/image/106261274-1574442599483rtx7a0ls.jpg?v=1574452686"
}
]
'''