24
loading...
This website collects cookies to deliver better user experience
import S&P-500 companies’ data with the Yahoo! Finance API into a Jupyter Notebook
setup a connection to CrateDB with Python
create functions to create tables, insert values, and retrieve data from CrateDB
upload finance market data into CrateDB
cd /crate-4.6.1
./bin/crate
.pip3 --version
, pip3 install notebook
jupyter notebook
print('Hello World!')
Alt
+ Enter
(or clicking on the Run button)download
function, which gets either a ticker symbol or a list of those as a parameter and downloads the data from these companies. import requests
from bs4 import BeautifulSoup
def get_sp500_ticker_symbols():
# getting html from SP500 Companies List wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
r = requests.get(url,timeout = 2.5)
r_html = r.text
soup = BeautifulSoup(r_html, 'html.parser')
# getting rows from wikipedia's table
components_table = soup.find_all(id = "constituents")
data_rows = components_table[0].find("tbody").find_all("tr")[1:]
# extracting ticker symbols from the data rows
tickers = []
for row in range(len(data_rows)):
stock = list(filter(None, data_rows[row].text.split("\n")))
symbol = stock[0]
if (symbol.find('.') != -1):
symbol = symbol.replace('.', '-')
tickers.append(symbol)
tickers.sort()
return tickers
data_rows
variabledata_rows
into the stock
list, where each element of the list contains information about one stock (Symbol, Security, SEC filings, …)stock
list element and adds it to the tickers
list tickers
list in alphabetical order and returns ittickers = get_sp500_ticker_symbols()
print(tickers)
yfinance
.import yfinance as yf
import pandas as pd
download
function returns a pandas.DataFrame object containing different kinds of information for a company, such as Date (which is the index for the DataFrame), Adjusted Close, High, Low, among others. download_YFinance_data
function:def download_YFinance_data(last_date):
tickers = get_sp500_ticker_symbols()
# downloading data from yfinance
data = yf.download(tickers, start = last_date)['Adj Close']
data.index.names = ['closing_date']
data.reset_index(inplace = True)
return data
my_data
variable, and print the result:my_data = download_YFinance_data('2021-11-16')
print(my_data)
import psycopg2 as ps
import math
math
package, which will be necessary for further steps. connect
function:host_name = 'localhost'
dbname = 'doc'
port = '5432'
username = 'crate'
password = ''
def connect_to_crateDB(host_name, dbname, port, username, password):
try:
conn = ps.connect(host = host_name, database = dbname, user = username,
password = password, port = port)
except ps.Error as e:
raise e
else:
print("Connected!")
return conn
conn
variable, which stores the connection, and a curr
cursor variable, which allows Python code to execute PostgreSQL commands.conn = connect_to_crateDB(host_name, dbname, port, username, password)
curr = conn.cursor()
CREATE TABLE IF NOT EXISTS
in my function.curr.execute
command:def create_table(table_name):
columns = "(closing_date TIMESTAMP, ticker TEXT, adjusted_close FLOAT)"
statement = "CREATE TABLE IF NOT EXISTS \"" + table_name + "\"" + columns + ";"
curr.execute(statement)
create_table('test_table')
df
: that way I can edit the DataFrame without changing the original data. Then, I create a table with the table_name (a new table will only be created in case there is no table with such a name). Finally, I create a ticker_symbols
variable, which stores the list of company symbols.
def insert_values(table_name, data):
df = data.copy()
# creates a new table (in case it does not exist yet)
create_table(table_name)
ticker_symbols = get_sp500_ticker_symbols()
df
so that it matches a supported type, in this case, timestamp (without time zone). # formatting date entries to match timestamp format
df['closing_date'] = df['closing_date'].astype('|S')
for i in range(len(df['closing_date'])):
df['closing_date'][i] = "'{}'".format(df['closing_date'][i] + "{time}".format(time = "T00:00:00Z"))
df
and store it as a list in date_values
, create the first part of the insert statement in insert_stmt
and create an empty array values_array
, where I will keep the value tuples for each company on that date. (closing_date, ticker, adjusted_close)
tuple for each company and adds it to my values_array
. values_array
to my insert_stmt
and separate all tuples with commas to match the statement standards.curr.execute(insert_stmt)
# formatting data to fit into insert statement and creating statement
for i in range(len(df)):
# saving entries from the ith line as a list of date values
date_values = df.iloc[i, :]
# first part of the insert statement
insert_stmt = "INSERT INTO \"{}\" (closing_date, ticker, adjusted_close) VALUES ".format(table_name)
# creating array of value tuples, as [(c1, c2, c3), (c4, c5, c6), ...]
values_array = []
for k in range(len(ticker_symbols)):
ticker = ticker_symbols[k]
# the date is always the first value in a row
closing_date = date_values[0]
# index is [k+1] because the date itself is in the first position ([0]), so all values are shifted
adj_close = date_values[k+1]
# checking if there is a NaN entry and setting it to -1
if (math.isnan(adj_close)):
adj_close = -1;
# putting a comma between tuples, but not on the last tuple
values_array.append("({},\'{}\',{})".format(closing_date, ticker, adj_close))
insert_stmt += ", ".join(values_array) + ";"
print(insert_stmt)
curr.execute(insert_stmt)
insert_values('test_table', my_data)
insert_values
function:def insert_values(table_name, data):
df = data.copy()
# creates a new table (in case it does not exist yet)
create_table(table_name)
ticker_symbols = get_sp500_ticker_symbols()
# formatting date entries to match timestamp format
df['closing_date'] = df['closing_date'].astype('|S')
for i in range(len(df['closing_date'])):
df['closing_date'][i] = "'{}'".format(df['closing_date'][i] + "{time}".format(time = "T00:00:00Z"))
# formatting data to fit into insert statement and creating statement
for i in range(len(df)):
# saving entries from the ith line as a list of date values
date_values = df.iloc[i, :]
# first part of the insert statement
insert_stmt = "INSERT INTO \"{}\" (closing_date, ticker, adjusted_close) VALUES ".format(table_name)
# creating array of value tuples, as [(c1, c2, c3), (c4, c5, c6), ...]
values_array = []
for k in range(len(ticker_symbols)):
ticker = ticker_symbols[k]
# the date is always the first value in a row
closing_date = date_values[0]
# index is [k+1] because the date itself is in the first position ([0]), so all values are shifted
adj_close = date_values[k+1]
# checking if there is a NaN entry and setting it to -1
if (math.isnan(adj_close)):
adj_close = -1;
# putting a comma between values tuples, but not on the last tuple
values_array.append("({},\'{}\',{})".format(closing_date, ticker, adj_close))
insert_stmt += ", ".join(values_array) + ";"
curr.execute(insert_stmt)
download_YFinance_data
function: this way, this function will only download new data!def select_last_inserted_date(table_name):
# creating table (only in case it does not exist yet)
create_table(table_name)
# selecting the maximum date in my table
statement = "select max(closing_date) from " + table_name + ";"
curr.execute(statement)
# fetching the results from the query
recent_date = curr.fetchall()
# if the query is empty or the date is None, start by 2015/01/01
if (len(recent_date) == 0 or recent_date[0][0] is None):
print("No data yet, will return: 2015-01-01")
return "2015-01-01"
# format date from timestamp to YYYY-MM-DD
last_date = recent_date[0][0].strftime("%Y-%m-%d")
# printing the last date
print("Most recent data on CrateDB from: " + last_date)
return last_date
select_last_inserted_date('test_table')
# Connecting to CrateDB
conn = connect_to_crateDB(host_name, dbname, port, username, password)
curr = conn.cursor()
table_name = "sp500"
# Creating S&P-500 table
create_table(table_name)
# Getting most recent date from sp500 table
last_date = select_last_inserted_date(table_name)
# downloading the data from the last_date on
data = download_YFinance_data(last_date)
# inserting the data values into CrateDB
insert_values(table_name, data)
select *
from sp500
where ticker = 'AAPL'
order by closing_date limit 100;