40
loading...
This website collects cookies to deliver better user experience
Don't worry if you don't understand what's going on behind the scenes now that your project is up and running. Fill your coffee mug, because things are about to get interesting.
from flask import (
Flask,
render_template,
request,
redirect,
flash,
url_for,
current_app
)
import urllib.request
from urllib.parse import urlparse,urljoin
from bs4 import BeautifulSoup
import requests,validators,json ,uuid,pathlib,os
# Flask application object; the @app.route decorators in this file register their views on it.
app = Flask(__name__)
@app.route("/", methods=("GET", "POST"), strict_slashes=False)
def index():
    """Render the scraper form and, on POST, the scrape results.

    On POST the form fields ``urltext`` (page URL) and ``specificElement``
    (tag name) are read, the page is fetched and parsed with
    ``html.parser``, and every matching element is handed to the
    ``index.html`` template together with the match count and the resolved
    image URLs. Any failure is flashed as a ``danger`` message and the
    plain form is rendered instead.

    Returns:
        The rendered ``index.html`` template.
    """
    # The download view reads these module-level names, so the latest scrape
    # is published through them.
    # NOTE(review): module globals are shared by every client of the
    # process; per-user storage would be safer — confirm intended usage.
    global requested_url, specific_element, tag

    if request.method == "POST":
        try:
            requested_url = request.form.get("urltext")
            tag = request.form.get("specificElement")
            # A timeout keeps an unresponsive target from hanging the request.
            source = requests.get(requested_url, timeout=10).text
            soup = BeautifulSoup(source, "html.parser")
            specific_element = soup.find_all(tag)
            counter = len(specific_element)
            image_paths = image_handler(tag, specific_element, requested_url)
            return render_template(
                "index.html",
                url=requested_url,
                counter=counter,
                image_paths=image_paths,
                results=specific_element,
            )
        except Exception as e:
            # Flashed messages are kept in the session, so store plain text
            # rather than the exception object itself.
            flash(str(e), "danger")

    # GET requests and failed POSTs both end up on the empty form.
    return render_template("index.html")
global requested_url,specific_element,tag
requested_url = request.form.get('urltext')
tag = request.form.get('specificElement')
We send a GET request to requested_url. The server answers the request by delivering the raw HTML content of the webpage, which we then convert to text with the .text attribute and assign to the variable source.
source = requests.get(requested_url).text
We use html.parser as our parsing library. By doing this, we are simply building a nested tree structure from the HTML data.
soup = BeautifulSoup(source, "html.parser")
specific_element = soup.find_all(tag)
We use counter to record the number of elements found, as demonstrated.
counter = len(specific_element)
image_paths is assigned the result of a function called image_handler(), which accepts the user-supplied URL, the tag, and the specific elements we extracted from the parsed page. We'll skip this function for now and come back to it later to see what it does.
image_paths = image_handler(
tag,
specific_element,
requested_url
)
return render_template("index.html",
url = requested_url,
counter=counter,
image_paths=image_paths,
results = specific_element
)
# /uploads/image.png, for example, will be difficult to determine where they
# originate. So we'll create a function similar to the one we called earlier
# to validate our image paths.
def image_handler(tag, specific_element, requested_url):
    """Return absolute URLs for the ``src`` of every scraped ``<img>``.

    Args:
        tag: Tag name the user searched for; only ``'img'`` yields results.
        specific_element: Iterable of parsed elements, each exposing an
            ``attrs`` mapping of its HTML attributes.
        requested_url: URL of the scraped page, used as the base when a
            ``src`` value is not already a valid absolute URL.

    Returns:
        A list of image URLs; empty when ``tag`` is not ``'img'`` or no
        element carries a usable ``src``.
    """
    image_paths = []
    if tag != "img":
        return image_paths

    for element in specific_element:
        # Some <img> tags (lazy-loaded or malformed) have no usable src;
        # skip them instead of raising KeyError and losing every result.
        image_path = element.attrs.get("src")
        if not image_path:
            continue
        if validators.url(image_path):
            full_path = image_path
        else:
            # Relative path: resolve it against the page it was found on.
            full_path = urljoin(requested_url, image_path)
        image_paths.append(full_path)
    return image_paths
The function checks whether tag is an image tag, then extracts each image's src attribute value and verifies whether it is an absolute URL. If it is not, it joins the relative path to the target's base URL.
https://example.com + /uploads/image.png
{# Results column: match count, link to the scraped URL, matched elements and image previews. #}
<div class="col-md-8">
<p><span class="badge bg-success">{{ counter }}</span> Scrap Results for <a
href="{{ url }}"> {{ url }}</a> </p>
<div class="bg-white shadow p-4 rounded results">
{# One paragraph per matched element; its items are joined with single spaces. #}
{% if results %}
{% for result in results %}
<p> {{ result | join(' ') }} </p>
{% endfor %}
{% endif %}
{# Each image path is rendered as a preview that links to the same path. #}
{% for path in image_paths %}
<a href=" {{ path }} "> <img src=" {{ path }} " class="img"> </a>
{% endfor %}
</div>
{# The download button is shown only when image_paths is non-empty. #}
{% if image_paths %}
<a href="{{url_for('downloader')}}" class="btn btn-primary m-2" id="download">Download
Images<i class="bi bi-cloud-arrow-down-fill m-2"></i></a>
{% endif %}
</div>
| join(' ')
It operates similarly to |striptags in that it removes all HTML tags from the variable result.
The image paths are then used as the href and src properties.
{% for path in image_paths %}
<a href=" {{ path }} ">
<img src=" {{ path }} " class="img">
</a>
{% endfor %}
{% if image_paths %}
<a href="{{url_for('downloader')}}" class="btn btn-primary m-2" id="download">
Download Images
<i class="bi bi-cloud-arrow-down-fill m-2"></i>
</a>
{% endif %}
@app.route("/download", methods=("GET", "POST"), strict_slashes=False)
def downloader():
    """Download every image from the most recent scrape, then redirect home.

    Image URLs are recomputed with ``image_handler`` from the module-level
    ``tag``, ``specific_element`` and ``requested_url``. Each image is saved
    in the current user's ``Downloads`` directory under a random UUID file
    name that keeps the original extension. The outcome is reported through
    a flashed message.

    Returns:
        A redirect response to the ``index`` view.
    """
    try:
        downloads_dir = pathlib.Path.home() / "Downloads"
        downloads_dir.mkdir(parents=True, exist_ok=True)
        for image_url in image_handler(tag, specific_element, requested_url):
            parsed_url = urlparse(image_url)
            # Scraped pages are untrusted input and urlretrieve also accepts
            # schemes such as file:, so only fetch over HTTP(S).
            if parsed_url.scheme not in ("http", "https"):
                continue
            # Take the extension from the URL path only, so a query string
            # (e.g. "logo.png?v=2") does not leak into the file name.
            file_ext = pathlib.Path(parsed_url.path).suffix
            picture_path = downloads_dir / (str(uuid.uuid4()) + file_ext)
            urllib.request.urlretrieve(image_url, str(picture_path))
        flash("Images saved in your Downloads directory", "success")
    except Exception as e:
        # The message survives the redirect inside the session, so it must
        # be plain text rather than the exception object.
        flash(str(e), "danger")
    return redirect(url_for("index"))
The uuid library is used by the download function above to produce unique names for the downloaded files.
filename = str(uuid.uuid4())
We use pathlib.Path() to extract the image extension from the image path.
file_ext = pathlib.Path(image_url).suffix
picture_path = os.path.join(downloads_path, picture_filename)
urllib.request.urlretrieve(image_url, picture_path)
urllib.request.urlretrieve() accepts the URL of each image to be downloaded and the path where it should be saved under its new name. As a result, the photos are saved in the Downloads directory of the current user's home folder.
if __name__ == "__main__":
    app.run(debug=True)