25
loading...
This website collects cookies to deliver better user experience
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
# Read the four raw CSV tables: training rows, the store table, the feature
# table and the test rows.
walmart, stores, features, testing = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'stores.csv', 'features.csv', 'test.csv')
)

# Left-join the store table and then the feature table onto each row set.
# No `on=` is given, so pandas joins on whatever columns the frames share.
merged = pd.merge(
    pd.merge(walmart, stores, how='left'), features, how='left')
testing_merged = pd.merge(
    pd.merge(testing, stores, how='left'), features, how='left')
def split_date(df):
    """Add calendar columns derived from ``df['Date']``, modifying ``df`` in place.

    ``Date`` is parsed with ``pd.to_datetime`` and written back, then four
    columns are added: ``Year``, ``Month``, ``Day`` and ``WeekOfYear`` (the
    ISO-8601 week number, stored as a float).

    Args:
        df: DataFrame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same ``df`` object, so calls can be chained. Callers that ignore
        the return value keep working, because the frame is mutated in place.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    dates = df['Date'].dt
    df['Year'] = dates.year
    df['Month'] = dates.month
    df['Day'] = dates.day
    # isocalendar() returns pandas' nullable UInt32 columns; cast explicitly to
    # NumPy float64 so the column is an ordinary float column rather than the
    # nullable "Float64" extension dtype that multiplying by 1.0 would produce.
    df['WeekOfYear'] = dates.isocalendar().week.astype('float64')
    return df
# Derive the calendar columns on both the training and the test frame.
for _frame in (merged, testing_merged):
    split_date(_frame)

# Print the column summary of the merged training frame.
merged.info()

# Count the NaN entries of every column and chart them as a bar plot.
# NOTE(review): the figure is a bare expression, so it is presumably only
# displayed when this runs as the last statement of a notebook cell.
missing_values = merged.isna().sum()
px.bar(
    missing_values,
    x=missing_values.index,
    y=missing_values.values,
    labels={"x": "Variable", "y": "Missing Values"},
    title="Missing Values",
)
January saw the lowest sales in both 2011 and 2012; for 2010, the data contains no weekly sales for January.
From February through October, weekly sales remain nearly constant at around 15000 in all 3 years.
November and December showed the highest sales in 2010 and 2011; for 2012, sales data for these months has not been provided.
# Encode the store type letter as a number (A -> 3, B -> 2, C -> 1) on both
# frames.
storetype_values = dict(A=3, B=2, C=1)
merged['Type_Numeric'] = merged['Type'].map(storetype_values)
testing_merged['Type_Numeric'] = testing_merged['Type'].map(storetype_values)

# Columns removed from both frames before modelling. The list is written once
# so the training and test frames cannot drift apart.
_unused_columns = [
    'Date', 'Temperature', 'Fuel_Price', 'Type',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'CPI', 'Unemployment', 'Month', 'Day',
]
merged = merged.drop(columns=_unused_columns)
testing_merged = testing_merged.drop(columns=_unused_columns)

# Every remaining column except the target is a model input.
target_col = 'Weekly_Sales'
input_cols = list(merged.columns)
input_cols.remove(target_col)

# Work on copies so later scaling does not write back into `merged`.
X = merged[input_cols].copy()
y = merged[target_col].copy()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the input columns of the merged training frame and
# apply the same fitted scaler to X and to the test frame.
# NOTE(review): the scaler is fit on all rows before the split below, so the
# held-out rows contribute to the scaling range.
scaler = MinMaxScaler()
scaler.fit(merged[input_cols])
X[input_cols] = scaler.transform(X[input_cols])
testing_merged[input_cols] = scaler.transform(testing_merged[input_cols])

# Hold out 30% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
from sklearn.linear_model import LinearRegression

# Create the model and train it on the training split only. Fitting on the
# full (X, y) would let the model see the rows held out in X_test / y_test,
# so any score computed on that split would not measure unseen data.
model = LinearRegression()
model.fit(X_train, y_train)

# Generate predictions on training data
train_preds = model.predict(X_train)
train_preds
array([17035.12035741, 15737.5350701 , 22990.63793901, ...,
20276.77727003, 21110.53004444, 23548.55581834])