pip install numpy
pip install pandas
pip install requests
pip install bs4
pip install lxml
Understat serves one page per league and season, at URLs of the form https://understat.com/league/EPL/2019. The statistics we need are embedded in one of each page's <script> tags: they are assigned to the teamsData variable and are JSON-encoded. As a result, we'll need to track down this tag, extract the JSON from it, and convert it to a Python-readable data structure.
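To see why the extraction below works, here is a minimal sketch using a shortened, hypothetical sample of that script content. Understat hex-escapes the JSON string it passes to JSON.parse, which is why the scraper slices out the text between "('" and "')" and then unicode-unescapes it:

# Shortened, hypothetical sample of the script tag's contents
sample = "var teamsData = JSON.parse('\\x7B\\x22138\\x22:\\x7B\\x22id\\x22:\\x22138\\x22\\x7D\\x7D');"
start = sample.index("('") + 2   # skip past "('"
end = sample.index("')")         # stop before "')"
payload = sample[start:end].encode('utf8').decode('unicode_escape')
print(payload)  # prints {"138":{"id":"138"}}, which is now valid JSON

With that in mind, here is the complete scraper: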
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
# create urls for all seasons of all leagues
base_url = 'https://understat.com/league'
leagues = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'Ligue_1']
seasons = ['2016', '2017', '2018', '2019', '2020']
full_data = dict()
for league in leagues:
    season_data = dict()
    for season in seasons:
        url = base_url+'/'+league+'/'+season
        res = requests.get(url)
        soup = BeautifulSoup(res.content, "lxml")
        # Based on the structure of the webpage, the data sits in a JSON variable under <script> tags
        scripts = soup.find_all('script')
        string_with_json_obj = ''
        # Find data for teams
        for el in scripts:
            if 'teamsData' in str(el):
                string_with_json_obj = str(el).strip()
        # print(string_with_json_obj)
        # Strip unnecessary symbols and keep only the JSON data
        ind_start = string_with_json_obj.index("('")+2
        ind_end = string_with_json_obj.index("')")
        json_data = string_with_json_obj[ind_start:ind_end]
        json_data = json_data.encode('utf8').decode('unicode_escape')
        # Convert the JSON data into a Python dictionary
        data = json.loads(json_data)
        # Get teams and their relevant ids and put them into a separate dictionary
        teams = {}
        for id in data.keys():
            teams[id] = data[id]['title']
        # EDA to get a feeling of how the JSON is structured
        # Column names are all the same, so we just use the first element
        columns = []
        # Check a sample of values per column
        values = []
        for id in data.keys():
            columns = list(data[id]['history'][0].keys())
            values = list(data[id]['history'][0].values())
            break
        # Getting data for all teams
        dataframes = {}
        for id, team in teams.items():
            teams_data = []
            for row in data[id]['history']:
                teams_data.append(list(row.values()))
            df = pd.DataFrame(teams_data, columns=columns)
            dataframes[team] = df
            # print('Added data for {}.'.format(team))
        # ppda and ppda_allowed come as dicts, so reduce each to a single coefficient
        for team, df in dataframes.items():
            dataframes[team]['ppda_coef'] = dataframes[team]['ppda'].apply(lambda x: x['att']/x['def'] if x['def'] != 0 else 0)
            dataframes[team]['oppda_coef'] = dataframes[team]['ppda_allowed'].apply(lambda x: x['att']/x['def'] if x['def'] != 0 else 0)
        # Aggregate the per-match rows into one season row per team
        cols_to_sum = ['xG', 'xGA', 'npxG', 'npxGA', 'deep', 'deep_allowed', 'scored', 'missed', 'xpts', 'wins', 'draws', 'loses', 'pts', 'npxGD']
        cols_to_mean = ['ppda_coef', 'oppda_coef']
        frames = []
        for team, df in dataframes.items():
            sum_data = pd.DataFrame(df[cols_to_sum].sum()).transpose()
            mean_data = pd.DataFrame(df[cols_to_mean].mean()).transpose()
            final_df = sum_data.join(mean_data)
            final_df['team'] = team
            final_df['matches'] = len(df)
            frames.append(final_df)
        full_stat = pd.concat(frames)
        full_stat = full_stat[['team', 'matches', 'wins', 'draws', 'loses', 'scored', 'missed', 'pts', 'xG', 'npxG', 'xGA', 'npxGA', 'npxGD', 'ppda_coef', 'oppda_coef', 'deep', 'deep_allowed', 'xpts']]
        full_stat.sort_values('pts', ascending=False, inplace=True)
        full_stat.reset_index(inplace=True, drop=True)
        full_stat['position'] = range(1,len(full_stat)+1)
        full_stat['xG_diff'] = full_stat['xG'] - full_stat['scored']
        full_stat['xGA_diff'] = full_stat['xGA'] - full_stat['missed']
        full_stat['xpts_diff'] = full_stat['xpts'] - full_stat['pts']
        cols_to_int = ['wins', 'draws', 'loses', 'scored', 'missed', 'pts', 'deep', 'deep_allowed']
        full_stat[cols_to_int] = full_stat[cols_to_int].astype(int)
        # Label each table with its league and season so col_order below can include them
        full_stat['league'] = league
        full_stat['year'] = season
        col_order = ['league', 'year', 'position', 'team', 'matches', 'wins', 'draws', 'loses', 'scored', 'missed', 'pts', 'xG', 'xG_diff', 'npxG', 'xGA', 'xGA_diff', 'npxGA', 'npxGD', 'ppda_coef', 'oppda_coef', 'deep', 'deep_allowed', 'xpts', 'xpts_diff']
        full_stat = full_stat[col_order]
        full_stat = full_stat.set_index('position')
        # print(full_stat.head(20))
        season_data[season] = full_stat
    df_season = pd.concat(season_data)
    full_data[league] = df_season
data = pd.concat(full_data)
data.to_csv('understat.com.csv')
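If you want a quick sanity check before moving on (this is not part of the original script, and the column picks are just examples), you can reload the CSV with pandas. The first two unnamed columns hold the league and season keys that pd.concat added as outer index levels:

check = pd.read_csv('understat.com.csv')
print(check.shape)
print(check[['team', 'matches', 'pts', 'xG']].head())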
Now create a collection named SOCCER_DATA. The SOCCER_DATA collection will store the data we extract to our CSV file in the database. For the History days and TTL settings, use the default values provided and save.

Next, create two indexes: check_name and get_all_teams_data. The get_all_teams_data index will allow you to scroll through data in the SOCCER_DATA collection. It has one term, the matches field; this term will enable you to match documents by their number of matches for easy querying. check_name will likewise allow you to scroll through data in the SOCCER_DATA collection; this index matches on the team field to perform queries.
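This tutorial creates the collection and indexes through the Fauna dashboard, but if you prefer to script the setup, a minimal sketch with the Python driver could look like the following (the admin_client name and placeholder secret are illustrative, not part of this tutorial):

from faunadb import query as q
from faunadb.client import FaunaClient

admin_client = FaunaClient(secret="your_admin_key")  # placeholder secret

# Collection that will hold the scraped team records
admin_client.query(q.create_collection({"name": "SOCCER_DATA"}))

# Index with one term on the matches field, for querying by number of matches
admin_client.query(q.create_index({
    "name": "get_all_teams_data",
    "source": q.collection("SOCCER_DATA"),
    "terms": [{"field": ["data", "matches"]}]
}))

# Index with one term on the team field, for checking whether a team already exists
admin_client.query(q.create_index({
    "name": "check_name",
    "source": q.collection("SOCCER_DATA"),
    "terms": [{"field": ["data", "team"]}]
}))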
To allow your application to connect to the database, click New Key to create a key. You will then be required to provide a database to connect the key to. After providing the required information, click the SAVE button.

With the database ready, create the Django project and its app:

django-admin startproject FAUNA_WEBSCRAPER
django-admin startapp APP

Add APP to the installed apps in the settings.py file, as seen in the code below:
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'APP',
]
Next, replace the content of the project's urls.py file with the Python code below:
from django.contrib import admin
from django.urls import path, include
from django.conf import settings
from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf.urls.static import static

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', include("APP.urls")),
]
urlpatterns += staticfiles_urlpatterns()
urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
Then create a urls.py file in the APP folder and paste in the Python code below:
from django.conf import settings
from django.conf.urls.static import static
from django.urls import path, include
from . import views

app_name = "App"

urlpatterns = [
    path("", views.index, name="index"),
]
Next, create the folders in the APP directory that handle the application's user interface (i.e. the templates and static folders). In the templates folder create an HTML file named index.html, and in the static folder a CSS file named style.css. Copy and paste the code below into their respective files.
{% load static %}
<!DOCTYPE html>
<html>
<head>
    <link href="{% static 'style.css' %}" rel="stylesheet">
</head>
<body>
    <h1>Fauna Sport Aggregator</h1>
    <table class="content-table">
        <thead>
            <tr>
                <th>Position</th>
                <th>Team</th>
                <th>Matches</th>
                <th>Wins</th>
                <th>Draws</th>
                <th>Loses</th>
                <th>Points</th>
            </tr>
        </thead>
        <tbody>
            {% for team in team_list_data %}
            <tr>
                <td>{{team.position}}</td>
                <td>{{team.team}}</td>
                <td>{{team.matches}}</td>
                <td>{{team.wins}}</td>
                <td>{{team.draws}}</td>
                <td>{{team.loses}}</td>
                <td>{{team.points}}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
</body>
</html>
/* style.css */
h1 {
    color: #0000A0;
    font-family: sans-serif;
}
.content-table {
    border-collapse: collapse;
    margin: 25px 0;
    font-size: 1.0em;
    font-weight: bold;
    font-family: sans-serif;
    min-width: 400px;
    border-radius: 5px 5px 0 0;
    overflow: hidden;
    box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
}
.content-table thead tr {
    background-color: #0000A0;
    color: #FFFFFF;
    text-align: left;
    font-weight: bold;
}
.content-table th,
.content-table td {
    padding: 15px 25px;
}
.content-table tbody tr {
    border-bottom: 1px solid #dddddd;
}
.content-table tbody tr:nth-of-type(even) {
    background-color: #f3f3f3;
    color: #0000A0;
    font-weight: bold;
}
.content-table tbody tr:last-of-type {
    border-bottom: 2px solid #0000A0;
}
.content-table tbody tr:hover {
    background-color: #D3D3D3;
}
Finally, update the views.py file in the APP folder with the code below:
from django.shortcuts import render
import csv
from faunadb import query as q
from faunadb.objects import Ref
from faunadb.client import FaunaClient

# Create your views here.
client = FaunaClient(secret="fauna_secret_key",
                     domain="db.eu.fauna.com",
                     # NOTE: Use the correct domain for your database's Region Group.
                     port=443,
                     scheme="https")

def index(request):
    team_data_list = []
    headers_list = []
    index = 0
    # Read the scraped CSV and build a list of dictionaries, one per team
    with open("understat.com.csv", 'r') as data:
        for line in csv.reader(data):
            index += 1
            if index > 1:
                team_dict = {}
                for i, elem in enumerate(headers_list):
                    team_dict[elem] = line[i]
                team_data_list.append(team_dict)
            else:
                headers_list = list(line)
    # Store the first 20 records in the SOCCER_DATA collection; the check_name
    # lookup raises if the team is not stored yet, in which case we create it
    for record in team_data_list[:20]:
        try:
            team_data_check = client.query(q.get(q.match(q.index('check_name'), record["team"])))
        except:
            team_record = client.query(q.create(q.collection("SOCCER_DATA"), {
                "data": {
                    "matches": record["matches"],
                    "position": record["position"],
                    "team": record["team"],
                    "wins": record["wins"],
                    "draws": record["draws"],
                    "loses": record["loses"],
                    "points": record["pts"]
                }
            }))
    # Fetch every stored team with 38 matches and pass the list to the template
    paginate_data = client.query(q.paginate(q.match(q.index("get_all_teams_data"), "38")))
    all_data = [
        q.get(
            q.ref(q.collection("SOCCER_DATA"), paginate.id())
        ) for paginate in paginate_data["data"]
    ]
    team_list_data = [i["data"] for i in client.query(all_data)]
    context = {"team_list_data": team_list_data}
    return render(request, "index.html", context)
python manage.py migrate
python manage.py runserver
Once the application runs, the SOCCER_DATA collection should look like the one in the image below.

Let's review what the code in views.py does. First, we imported the required libraries and connected to Fauna with the Python client:

from django.shortcuts import render
import csv
from faunadb import query as q
from faunadb.objects import Ref
from faunadb.client import FaunaClient
client = FaunaClient(secret="fauna_secret_key",  # use your own secret key here
                     domain="db.eu.fauna.com",
                     # NOTE: Use the correct domain for your database's Region Group.
                     port=443,
                     scheme="https")
Next, we prepared the data to be stored in the SOCCER_DATA collection. We did this by reading the data in the CSV file and saving it to a list variable, team_data_list, in our program:

team_data_list = []
headers_list = []
index = 0
with open("understat.com.csv", 'r') as data:
    for line in csv.reader(data):
        index += 1
        if index > 1:
            team_dict = {}
            for i, elem in enumerate(headers_list):
                team_dict[elem] = line[i]
            team_data_list.append(team_dict)
        else:
            headers_list = list(line)
We then took the records in the team_data_list variable and stored them in the SOCCER_DATA collection. To do this, we iterated over every record in the list and made a request with the Fauna client to save the dictionary we defined to the collection. We only saved the first 20 records from the list for this tutorial, so we don't exceed our free account's rate limit:

for record in team_data_list[:20]:
    try:
        team_data_check = client.query(q.get(q.match(q.index('check_name'), record["team"])))
    except:
        team_record = client.query(q.create(q.collection("SOCCER_DATA"), {
            "data": {
                "matches": record["matches"],
                "position": record["position"],
                "team": record["team"],
                "wins": record["wins"],
                "draws": record["draws"],
                "loses": record["loses"],
                "points": record["pts"]
            }
        }))
Finally, we used the get_all_teams_data Fauna index to match the data for all teams with 38 matches. We paginated the queried data using Fauna's paginate method, then created a list variable called team_list_data from it. Finally, we passed the list's data to the front-end user interface for display in a table.
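That last step corresponds to these lines of the index view shown earlier:

# (inside the index view)
paginate_data = client.query(q.paginate(q.match(q.index("get_all_teams_data"), "38")))
all_data = [
    q.get(
        q.ref(q.collection("SOCCER_DATA"), paginate.id())
    ) for paginate in paginate_data["data"]
]
team_list_data = [i["data"] for i in client.query(all_data)]
context = {"team_list_data": team_list_data}
return render(request, "index.html", context)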