20
loading...
This website collects cookies to deliver a better user experience.
# main part, not complete code
def populate_data_frame_in_prediction_time(data, columns):
    """One-hot encode every column of ``data`` onto a fixed set of ``columns``.

    For each cell the target column is ``"<column name>_<value>"``. When that
    name is not one of ``columns`` the cell is counted under
    ``"<column name>_nan"`` instead. NaN cells stringify to ``"nan"`` and so
    land in that same fallback column.

    Args:
        data: DataFrame whose columns are encoded. Its index may be anything;
            rows are matched to the result by position.
        columns: Names of the output columns, in output order.

    Returns:
        A ``uint8`` DataFrame with ``len(data)`` rows indexed ``0..n-1`` and
        exactly ``columns`` as its columns.

    Raises:
        KeyError: A value is unknown and ``"<column name>_nan"`` is not in
            ``columns`` either.
    """
    unknown_col = "nan"
    columns = list(columns)
    column_position = {name: pos for pos, name in enumerate(columns)}

    # Write into a plain array by position. The previous
    # ``frame[col][index] = 1`` form was chained assignment, which pandas does
    # not guarantee to write through (and never does under Copy-on-Write), and
    # it addressed rows by ``data``'s labels rather than by position.
    encoded = np.zeros((len(data), len(columns)), dtype="uint8")

    for source_position, prefix in enumerate(data.columns):  # O(m)
        unknown_column_name = str(prefix) + "_" + str(unknown_col)
        # Iterate the column itself rather than ``iterrows()`` so each value
        # keeps its own dtype (``iterrows`` upcasts mixed rows, turning an
        # integer 1 into 1.0 and therefore into the name "<prefix>_1.0").
        for row_position, value in enumerate(data.iloc[:, source_position]):  # O(n)
            target = column_position.get(str(prefix) + "_" + str(value))  # O(1)
            if target is None:
                target = column_position[unknown_column_name]
            encoded[row_position, target] = 1

    return pd.DataFrame(encoded, index=np.arange(len(data)), columns=columns)
def _custom_one_hot_encoding_1d(series, column_list):
    """One-hot encode a single Series onto a fixed list of output columns.

    Each value maps to the column ``"<series.name>_<value>"``. ``None`` and
    the empty string are treated as missing, and any value whose column is
    not in ``column_list`` falls back to ``"<series.name>_nan"``.

    Args:
        series: Values to encode; ``series.name`` is used as the column
            prefix. Rows are matched to the result by position, so the
            series index may be anything.
        column_list: Names of the output columns, in output order.

    Returns:
        An integer DataFrame with ``len(series)`` rows and ``column_list`` as
        columns, holding a single 1 per row.

    Raises:
        KeyError: A value is unknown and ``"<series.name>_nan"`` is not in
            ``column_list`` either.
    """
    unknown_col = "nan"
    # Stringify once so a non-string series name (e.g. an int) cannot raise
    # TypeError when the fallback name is built.
    prefix = str(series.name)
    fallback_name = prefix + "_" + unknown_col

    df = pd.DataFrame(
        np.zeros((len(series), len(column_list)), dtype=int),
        columns=column_list,
    )
    known_columns = df.columns

    for row_position, value in enumerate(series):
        # Only None / "" count as missing. A plain truthiness test would also
        # discard legitimate categories such as 0 or False.
        if value is None or (isinstance(value, str) and not value):
            value = unknown_col
        column_name = prefix + "_" + str(value)
        if column_name not in known_columns:
            column_name = fallback_name
        # Positional scalar write: avoids chained assignment and does not
        # depend on the labels of ``series``.
        df.iat[row_position, known_columns.get_loc(column_name)] = 1
    return df
def _custom_one_hot_encoding_1d(series, column_list):
    """One-hot encode a single Series onto a fixed list of output columns.

    The encoding is built in a NumPy array and wrapped in a DataFrame at the
    end. Each value maps to the column ``"<series.name>_<value>"``. ``None``
    and the empty string are treated as missing, and any value whose column
    is not in ``column_list`` falls back to ``"<series.name>_nan"``.

    Args:
        series: Values to encode; ``series.name`` is used as the column
            prefix. Rows are matched to the result by position, so the
            series index may be anything.
        column_list: Names of the output columns, in output order.

    Returns:
        An integer DataFrame with ``len(series)`` rows and ``column_list`` as
        columns, holding a single 1 per row.

    Raises:
        KeyError: A value is unknown and ``"<series.name>_nan"`` is not in
            ``column_list`` either.
    """
    unknown_col = "nan"
    # Stringify once so a non-string series name (e.g. an int) cannot raise
    # TypeError when the fallback name is built.
    prefix = str(series.name)
    fallback_name = prefix + "_" + unknown_col

    # Name -> column position; also serves as the O(1) membership test.
    column_idx = {column_name: position for position, column_name in enumerate(column_list)}
    result_arr = np.zeros((len(series), len(column_list)), dtype=int)

    # Enumerate for the row position: the series' own index labels are not
    # valid array offsets unless the index happens to be 0..n-1.
    for row_position, value in enumerate(series):
        # Only None / "" count as missing. A plain truthiness test would also
        # discard legitimate categories such as 0 or False.
        if value is None or (isinstance(value, str) and not value):
            value = unknown_col
        position = column_idx.get(prefix + "_" + str(value))
        if position is None:
            position = column_idx[fallback_name]
        result_arr[row_position, position] = 1

    return pd.DataFrame(result_arr, columns=column_list)