34
loading...
This website collects cookies to deliver better user experience
Memanggil library di Python
import library_name as alias
example:
import numpy as np # memanggil library numpy sebagai np
Exploratory Data Analysis dengan Pandas - Part 1
Salah satu fungsi Pandas yaitu melakukan load data dari CSV atau Excel file. Syntax yang digunakan untuk melakukan operasi tersebut, yaitu:
import pandas as pd
# Membaca file CSV
variable_name = pd.read_csv("file_name.csv")
# Membaca file Excel
variable_name = pd.read_excel("file_name.xlsx")
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
print([nama_dataframe].shape)
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.shape
(2, 2)
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
... 'col3': [5, 6]})
>>> df.shape
(2, 3)
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
print(order_df.shape)
head()
⇒ mengembalikan n baris pertama dari objek berdasarkan posisi. Ini berguna untuk menguji dengan cepat apakah objek Anda memiliki tipe data yang tepat di dalamnya. Default: 5 baris.
tail()
⇒ mengembalikan n baris terakhir dari objek berdasarkan posisi. Ini berguna untuk memverifikasi data dengan cepat, misalnya setelah menyortir atau menambahkan baris. Default: 5 baris.
# Menampilkan konten teratas dari [nama_dataframe]
# untuk sejumlah bilangan bulat [jumlah_data]
print([nama_dataframe].head(jumlah_data))
# Menampilkan konten terbawah dari [nama_dataframe]
# untuk sejumlah bilangan bulat [jumlah_data]
print([nama_dataframe].tail(jumlah_data))
head()
dengan limit 10 baris!
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
print(order_df.head(10))
describe()
, yaitu:
print([nama_dataframe].describe())
>>> data_df.describe()
data_df.describe()
CustomerID Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000
describe()
akan secara otomatis mengabaikan kolom category dan hanya memberikan summary statistik untuk kolom berjenis numerik.
include="all"
untuk mendapatkan summary statistik atau statistik deskriptif dari kolom numerik dan karakter.
print([nama_dataframe].describe(include="all"))
>>> data_df.describe(include='all')
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200 200.000000 200.000000 200.000000
unique NaN 2 NaN NaN NaN
top NaN Female NaN NaN NaN
freq NaN 112 NaN NaN NaN
mean 100.500000 NaN 38.850000 60.560000 50.200000
std 57.879185 NaN 13.969007 26.264721 25.823522
min 1.000000 NaN 18.000000 15.000000 1.000000
25% 50.750000 NaN 28.750000 41.500000 34.750000
50% 100.500000 NaN 36.000000 61.500000 50.000000
75% 150.250000 NaN 49.000000 78.000000 73.000000
max 200.000000 NaN 70.000000 137.000000 99.000000
include=['object']
pada describe()
.
print(nilai_skor_df.describe(include=['object']))
>>> data_df.describe(include=['object'])
Genre
count 200
unique 2
top Female
freq 112
[mean](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html), [median](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.median.html), dan [mode](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mode.html) dari Pandas.
print([nama_dataframe].loc[:, "nama_kolom"].mean())
print([nama_dataframe].loc[:, "nama_kolom"].median())
print([nama_dataframe].loc[:, "nama_kolom"].mode())
>>> data_df.loc[:, "Age"].mean()
38.85
>>> data_df.loc[:, "Age"].median()
36.0
>>> data_df.loc[:, "Age"].mode()
0 32
dtype: int64
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
# summary bulanan dari segi kuantitas, harga, freight value, dan weight
print(order_df.describe())
# Median dari total pembelian konsumen per transaksi kolom price
print(order_df.loc[:, "price"].median())
nama_dataframe[["nama_kolom"]].hist(
bins=jumlah_bin,
by=nama_kolom,
alpha=nilai_alpha,
figsize=tuple_ukuran_gambar
)
figsize: tuple_ukuran_gambar yang digunakan untuk menentukan ukuran dari plot histogram. Contoh: figsize=(10, 12)
[x] Tugas praktek 5
import pandas as pd
import matplotlib.pyplot as plt
order_df = pd.read_csv("order.csv")
order_df[['price']].hist(figsize=(4,5), bins=10, xlabelsize=8, ylabelsize=8)
plt.show()
print([nama_dataframe].loc[:, "nama_kolom"].std())
print([nama_dataframe].loc[:, "nama_kolom"].var())
>>> shop_df.loc[:, "Annual Income (k$)"].std()
26.264721165271244
>>> shop_df.loc[:, "Annual Income (k$)"].var()
689.8355778894472
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
# Standar deviasi kolom product_weight_gram
order_df.loc[:, "product_weight_gram"].std()
# Varians kolom product_weight_gram
order_df.loc[:, "product_weight_gram"].var()
Q1 = nama_dataframe.quantile(0.25)
Q3 = nama_dataframe.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
>>> Q1 = shopping_df[["Annual Income (k$)"]].quantile(0.25)
>>> Q3 = shopping_df[["Annual Income (k$)"]].quantile(0.75)
>>> IQR = Q3 - Q1
>>> IQR
Annual Income (k$) 36.5
dtype: float64
>>> (shopping_df < (Q1 - 1.5 * IQR)) | (shopping_df > (Q3 + 1.5 * IQR))
<stdin>:1: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`
<stdin>:1: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`
Age Annual Income (k$) CustomerID Genre Spending Score (1-100)
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
.. ... ... ... ... ...
195 False False False False False
196 False False False False False
197 False False False False False
198 False True False False False
199 False True False False False
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
# Hitung quartile 1
Q1 = order_df[["product_weight_gram"]].quantile(0.25)
# Hitung quartile 3
Q3 = order_df[["product_weight_gram"]].quantile(0.75)
# Hitung inter quartile range dan cetak ke console
IQR = Q3 - Q1
print(IQR)
1. Menggunakan nama kolom
2. Menggunakan indeks kolom
{% raw %}
```python
nama_dataframe.rename(columns={"column_name_before": "column_name_after"}, inplace=True)
nilai_skor_df.rename(columns={"Age": "Umur"}, inplace=True)
```
{% endraw %}
2. **Rename menggunakan indeks kolom**
syntax:
{% raw %}
```python
nama_dataframe.columns.values[no_of_column] = "column_name_after"
nilai_skor_df.columns.values[0] = "Umur"
```
{% endraw %}
rename()
.
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
# Ganti nama kolom freight_value menjadi shipping_cost
order_df.rename(columns={"freight_value": "shipping_cost"}, inplace=True)
print(order_df)
.groupby
adalah mencari summary dari data frame dengan menggunakan aggregate dari kolom tertentu.
>>> shopping_data["Age"].groupby([shopping_data["Genre"]]).mean()
Genre
Female 38.098214
Male 39.806818
Name: Age, dtype: float64
>>> order_df["freight_value"].groupby([order_df["product_category_name"], order_df["order_status"]]).sum()
product_category_name order_status
automotive canceled 2212000
delivered 640559000
invoiced 2909000
processing 4152000
shipped 9012000
beauty canceled 4089000
delivered 626722000
invoiced 3132000
processing 3016000
shipped 7613000
unavailable 186000
electronics approved 10000
canceled 3993000
delivered 638651000
invoiced 3823000
processing 3462000
shipped 8280000
unavailable 194000
fashion canceled 3582000
delivered 635846000
invoiced 3951000
processing 3434000
shipped 7192000
unavailable 53000
gadget approved 53000
canceled 3294000
delivered 634502000
invoiced 1787000
processing 4236000
shipped 6386000
sports canceled 3634000
delivered 633185000
invoiced 2652000
processing 3396000
shipped 6758000
unavailable 207000
toys canceled 3381000
delivered 645547000
invoiced 3584000
processing 3518000
shipped 6627000
utilities approved 39000
canceled 2537000
delivered 633463000
invoiced 1494000
processing 3544000
shipped 6068000
Name: freight_value, dtype: int64
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
rata_rata = order_df["price"].groupby(order_df["payment_type"]).mean()
print(rata_rata)
nama_dataframe.sort_values(by="nama_kolom")
>>> shopping_data.sort_values(by="Age")
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
114 115 Female 18 65 48
91 92 Male 18 59 41
65 66 Male 18 48 59
33 34 Male 18 33 92
0 1 Male 19 15 39
.. ... ... ... ... ...
90 91 Female 68 59 55
108 109 Male 68 63 43
57 58 Male 69 44 46
70 71 Male 70 49 55
60 61 Male 70 46 56
nama_dataframe.sort_values(by="nama_kolom", ascending=False)
>>> shopping_data.sort_values(by="Age", ascending=False)
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
70 71 Male 70 49 55
60 61 Male 70 46 56
57 58 Male 69 44 46
90 91 Female 68 59 55
67 68 Female 68 48 48
.. ... ... ... ... ...
0 1 Male 19 15 39
33 34 Male 18 33 92
65 66 Male 18 48 59
91 92 Male 18 59 41
114 115 Female 18 65 48
nama_dataframe.sort_values(by=["nama_kolom1", "nama_kolom2"], ascending=[False, True])
>>> shopping_data.sort_values(by=["Age", "Spending Score (1-100)"], ascending=[False, True])
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
70 71 Male 70 49 55
60 61 Male 70 46 56
57 58 Male 69 44 46
108 109 Male 68 63 43
67 68 Female 68 48 48
.. ... ... ... ... ...
68 69 Male 19 48 59
91 92 Male 18 59 41
114 115 Female 18 65 48
65 66 Male 18 48 59
33 34 Male 18 33 92
[200 rows x 5 columns]
import pandas as pd
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
# Hitung harga maksimum pembelian customer
sort_harga = order_df.sort_values(by="price", ascending=False)
print(sort_harga)
import pandas as pd
import matplotlib.pyplot as plt
order_df = pd.read_csv("https://storage.googleapis.com/dqlab-dataset/order.csv")
# Median price yang dibayar customer dari masing-masing metode pembayaran.
median_price = order_df["price"].groupby(order_df["payment_type"]).median()
print(median_price)
# Ubah freight_value menjadi shipping_cost dan cari shipping_cost
# termahal dari data penjualan tersebut menggunakan sort.
order_df.rename(columns={"freight_value": "shipping_cost"}, inplace=True)
sort_value = order_df.sort_values(by="shipping_cost", ascending=False)
print(sort_value)
# Untuk product_category_name, berapa rata-rata weight produk tersebut
# dan standar deviasi mana yang terkecil dari weight tersebut,
mean_value = order_df["product_weight_gram"].groupby(order_df["product_category_name"]).mean()
print(mean_value.sort_values())
std_value = order_df["product_weight_gram"].groupby(order_df["product_category_name"]).std()
print(std_value.sort_values())
# Buat histogram quantity penjualan dari dataset tersebut untuk melihat persebaran quantity
# penjualan tersebut dengan bins = 5 dan figsize= (4,5)
order_df[["quantity"]].hist(figsize=(4, 5), bins=5)
plt.show()