-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_data.py
41 lines (31 loc) · 1.4 KB
/
split_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def split_data(file_path = None, df = None, prefix="", seed=1, ret = 0):
    """
    Split data into train, validation and test sets and save them as CSV files.

    The split is sequential (``shuffle=False``): the last 25% of the rows
    become the test set, and the last 25% of the remaining rows become the
    validation set. This gives roughly 56.25% train, 18.75% validation and
    25% test, in the original row order.

    The three parts are written to the ``ProcessedData/`` folder (created if
    missing) without index and without a header row, as
    ``[prefix]_train.csv``, ``[prefix]_val.csv`` and ``[prefix]_test.csv``.

    Args:
        file_path: Path to the input data in CSV format. Takes precedence
            over ``df`` when both are given. A leftover ``'Unnamed: 0'``
            index column is dropped if present.
        df: DataFrame to split, used when ``file_path`` is None. It is not
            modified.
        prefix: Prefix for the output file names.
        seed: Forwarded to ``train_test_split`` as ``random_state``.
            NOTE: because the split is done with ``shuffle=False``, the seed
            is not expected to influence the result (see scikit-learn docs).
        ret: Set to 1 to get the resulting DataFrames back.

    Returns:
        Tuple ``(train, val, test)`` of pandas DataFrames when ``ret == 1``,
        otherwise None.

    Raises:
        ValueError: If neither ``file_path`` nor ``df`` is provided.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import section.
    import os

    if file_path is not None:
        data = pd.read_csv(file_path)
        # CSVs saved with their index carry an 'Unnamed: 0' column; tolerate
        # files that do not have it instead of failing with a KeyError.
        data = data.drop(columns=['Unnamed: 0'], errors='ignore')
    elif df is not None:
        data = df
    else:
        # Previously this only printed and then crashed on the unbound
        # ``data`` variable; fail early with a meaningful exception instead.
        raise ValueError("Error : Please Provide a Datafile/Frame")

    # First carve off the test set, then split the remainder into train/val.
    train, test = train_test_split(
        data, test_size=0.25, random_state=seed, shuffle=False
    )
    train, val = train_test_split(
        train, test_size=0.25, random_state=seed, shuffle=False
    )

    dir_prefix = 'ProcessedData/'
    base_path = dir_prefix + prefix
    # Make sure the target folder exists (also covers prefixes that contain
    # a sub-directory) so that to_csv does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(base_path), exist_ok=True)

    for suffix, part in (("_train.csv", train), ("_val.csv", val), ("_test.csv", test)):
        part.to_csv(base_path + suffix, index=False, header=False)

    if ret == 1:
        return train, val, test
    return None