None of this required a database. We used files as our "database", but mostly we used Airtable.
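To make the "files as a database" idea concrete, here is a minimal sketch of storing scraped records in a JSON file. The filename scraped_links.json and the record shape are illustrative assumptions, not the exact files from the project.
import json

# Hypothetical example: keep scraped records in a plain JSON file instead of a real database.
records = [{"Link": "https://example.com/course", "Text": "Example course"}]
with open("scraped_links.json", "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2)

# Read the "database" back later, much like querying a table.
with open("scraped_links.json", encoding="utf-8") as f:
    print(json.load(f))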
class="btn btn-primary"
. Interesting, we've found a pattern. Great! We can work with that. import requests
response = requests.get("https://kalob.io/teaching/")
print(response.content)
Note: UTF-8 is the most commonly used encoding on the internet, so we'll decode the HTML we scraped into UTF-8 text (one giant string).
import requests
response = requests.get("https://kalob.io/teaching/")
html = response.content.decode("utf-8")
print(html)
attr=""
some people use attr=''
some people use XHTML and some don't. pip install beautifulsoup4
import requests
response = requests.get("https://kalob.io/teaching/")
html = response.content.decode("utf-8")
import bs4 # You'll need to `pip install beautifulsoup4`
soup = bs4.BeautifulSoup(html, "html.parser")
print(soup) # Shows the parsed HTML
print(type(soup)) # Returns <class 'bs4.BeautifulSoup'>
The soup variable is no longer a string, but an object. This means we can use object methods on it, like looking for certain elements in the HTML we scraped.
import requests
response = requests.get("https://kalob.io/teaching/")
html = response.content.decode("utf-8")
import bs4 # You'll need to `pip install beautifulsoup4`
soup = bs4.BeautifulSoup(html, "html.parser")
courses = soup.findAll("a", {"class": ["btn btn-primary"]})
print(courses)
for course in courses:
    print(course.get("href"))
    print(course.text.strip())
    print("\n")
For each course, this prints the href attribute and the innerText without any surrounding whitespace. Putting the whole scraper together:
import requests
import bs4 # You'll need to `pip install beautifulsoup4`
response = requests.get("https://kalob.io/teaching/")
html = response.content.decode("utf-8")
soup = bs4.BeautifulSoup(html, "html.parser")
courses = soup.findAll("a", {"class": ["btn btn-primary"]})
for course in courses:
    print(f"{course.get('href')} -> {course.text.strip()}")
To get these links into Airtable, we'll use the airtable-python-wrapper package.
pip install airtable-python-wrapper
from airtable.airtable import Airtable
airtable = Airtable('appXXXXXXXXX', 'Links', 'keyXXXXXXXXXX')  # base ID, table name, API key
import requests
import bs4 # You'll need to `pip install beautifulsoup4`
from airtable.airtable import Airtable
response = requests.get("https://kalob.io/teaching/")
html = response.content.decode("utf-8")
soup = bs4.BeautifulSoup(html, "html.parser")
courses = soup.findAll("a", {"class": ["btn btn-primary"]})
airtable = Airtable('appXXXXXXXXX', 'Links', 'keyXXXXXXXXXX')
for course in courses:
    new_record = {
        "Link": course.get('href'),
        "Text": course.text.strip(),
    }
    airtable.insert(new_record)
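If you want to sanity-check that the rows landed, airtable-python-wrapper can also read records back. A quick sketch, reusing the same airtable object from above:
# Fetch every record from the Links table and print the fields we just inserted.
for record in airtable.get_all():
    fields = record["fields"]
    print(fields.get("Link"), "->", fields.get("Text"))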