Skip to content
Snippets Groups Projects
Commit 23ab3bf1 authored by Laura Christine Kühle's avatar Laura Christine Kühle
Browse files

Built Snakemake pipeline for training data generation and model training.

parent 25dd8d99
No related branches found
No related tags found
No related merge requests found
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
@author: Soraya Terrab (sorayaterrab), Laura C. Kühle @author: Soraya Terrab (sorayaterrab), Laura C. Kühle
TODO: Improve '_generate_cell_data' TODO: Improve '_generate_cell_data'
TODO: Extract normalization (At all? Over smooth and troubled separately?) TODO: Extract normalization (Combine smooth and troubled before normalizing)
TODO: Discontinue data splitting -> Done TODO: Discontinue data splitting -> Done
TODO: Improve verbose output TODO: Improve verbose output
...@@ -59,9 +59,9 @@ class TrainingDataGenerator(object): ...@@ -59,9 +59,9 @@ class TrainingDataGenerator(object):
+ str((num_samples*(1-self._balance))/1000)\ + str((num_samples*(1-self._balance))/1000)\
+ 'k__' + normalize_string + 'normalized.npy' + 'k__' + normalize_string + 'normalized.npy'
input_name = self._data_dir + '/training_input' + name input_name = self._data_dir + '/training_input.npy' # + name
np.save(input_name, data[0]) np.save(input_name, data[0])
output_name = self._data_dir + '/training_output' + name output_name = self._data_dir + '/training_output.npy' # + name
np.save(output_name, data[1]) np.save(output_name, data[1])
def _calculate_data_set(self, num_samples, normalize): def _calculate_data_set(self, num_samples, normalize):
...@@ -176,26 +176,9 @@ class TrainingDataGenerator(object): ...@@ -176,26 +176,9 @@ class TrainingDataGenerator(object):
# Get Training/Validation Datasets # Get Training/Validation Datasets
np.random.seed(1234) np.random.seed(1234)
# generator = TrainingDataGenerator(functions, left_bound=boundary[0], right_bound=boundary[1])
boundary = [-1, 1]
functions = [{'function': Initial_Condition.Sine(boundary[0], boundary[1], {}),
'config': {'factor': 2}},
{'function': Initial_Condition.Linear(boundary[0], boundary[1], {}),
'config': {}},
{'function': Initial_Condition.Polynomial(boundary[0], boundary[1], {}),
'config': {}},
{'function': Initial_Condition.Continuous(boundary[0], boundary[1], {}),
'config': {}},
{'function': Initial_Condition.LinearAbsolut(boundary[0], boundary[1], {}),
'config': {}},
{'function': Initial_Condition.HeavisideOneSided(boundary[0], boundary[1], {}),
'config': {}},
{'function': Initial_Condition.HeavisideTwoSided(boundary[0], boundary[1], {}),
'config': {'adjustment': 0}}]
generator = TrainingDataGenerator(functions, left_bound=boundary[0], right_bound=boundary[1])
# generator = TrainingDataGenerator(functions, left_bound=boundary[0], right_bound=boundary[1]) # generator = TrainingDataGenerator(functions, left_bound=boundary[0], right_bound=boundary[1])
sample_number = 1000 sample_number = 1000
data_1 = generator.build_training_data(sample_number, 0) # data_1 = generator.build_training_data(sample_number, 0)
# data_2 = generator.build_training_data(sample_number, 1) # data_2 = generator.build_training_data(sample_number, 1)
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
TODO: Improve 'epoch_training()' -> Done TODO: Improve 'epoch_training()' -> Done
TODO: Add ANN classification from Soraya TODO: Add ANN classification from Soraya
TODO: Improve naming of training data/model (maybe different folders?; split training data later; TODO: Improve naming of training data/model (maybe different folders?; split training data later;
total amount + percentage?) total amount + percentage?) -> Done
TODO: Give option to compare multiple models TODO: Give option to compare multiple models
TODO: Use sklearn for classification TODO: Use sklearn for classification
TODO: Fix difference between accuracies (stems from rounding; choose higher value instead) TODO: Fix difference between accuracies (stems from rounding; choose higher value instead)
...@@ -15,6 +15,9 @@ TODO: Fix bug in valid_loss calculation -> Done ...@@ -15,6 +15,9 @@ TODO: Fix bug in valid_loss calculation -> Done
TODO: Replace multi-file set-up for training data -> Done TODO: Replace multi-file set-up for training data -> Done
TODO: Rework model testing TODO: Rework model testing
TODO: Limit batch size with regard to training data -> Done TODO: Limit batch size with regard to training data -> Done
TODO: Build Snakemake pipeline for model training -> Done
TODO: Clean up directories/naming
TODO: Add log to pipeline
""" """
import numpy as np import numpy as np
...@@ -69,8 +72,8 @@ class ModelTrainer(object): ...@@ -69,8 +72,8 @@ class ModelTrainer(object):
def _read_training_data(self, directory): def _read_training_data(self, directory):
# Get training dataset from saved file and map to Torch tensor and dataset # Get training dataset from saved file and map to Torch tensor and dataset
input_file = directory + '/training_input__' + self._data_file input_file = directory + '/training_input.npy' # + self._data_file
output_file = directory + '/training_output__' + self._data_file output_file = directory + '/training_output.npy' # + self._data_file
self._training_data = TensorDataset(*map(torch.tensor, (np.load(input_file), self._training_data = TensorDataset(*map(torch.tensor, (np.load(input_file),
np.load(output_file)))) np.load(output_file))))
...@@ -188,17 +191,16 @@ class ModelTrainer(object): ...@@ -188,17 +191,16 @@ class ModelTrainer(object):
def save_model(self): def save_model(self):
# Saving Model # Saving Model
data_name = self._data_file.split('.npy')[0] # data_name = self._data_file.split('.npy')[0]
path = self._model.get_name() + '__' + self._optimizer.__class__.__name__ + '_' \ # path = self._model.get_name() + '__' + self._optimizer.__class__.__name__ + '_' \
+ str(self._learning_rate) + '__' + self._loss_function.__class__.__name__ + '__' \ # + str(self._learning_rate) + '__' + self._loss_function.__class__.__name__ + '.pt'
+ data_name + '.pt'
# Set paths for plot files if not existing already # Set paths for plot files if not existing already
if not os.path.exists(self._model_dir): if not os.path.exists(self._model_dir):
os.makedirs(self._model_dir) os.makedirs(self._model_dir)
torch.save(self._model.state_dict(), self._model_dir + '/Model__' + path) torch.save(self._model.state_dict(), self._model_dir + '/model.pt') # __' + path)
torch.save(self._validation_loss, self._model_dir + '/Loss__' + path) torch.save(self._validation_loss, self._model_dir + '/loss.pt') # __' + path)
def _classify(self): def _classify(self):
pass pass
...@@ -207,7 +209,7 @@ class ModelTrainer(object): ...@@ -207,7 +209,7 @@ class ModelTrainer(object):
# Loss Functions: BCELoss, BCEWithLogitsLoss, # Loss Functions: BCELoss, BCEWithLogitsLoss,
# CrossEntropyLoss (not working), MSELoss (with reduction='sum') # CrossEntropyLoss (not working), MSELoss (with reduction='sum')
# Optimizer: Adam, SGD # Optimizer: Adam, SGD
trainer = ModelTrainer({'num_epochs': 1000}) # trainer = ModelTrainer({'num_epochs': 1000})
trainer.epoch_training() # trainer.epoch_training()
# trainer.test_model() # trainer.test_model()
trainer.save_model() # trainer.save_model()
configfile: 'config.yaml'
import ANN_Data_Generator, Initial_Condition, ANN_Training
def replace_none(value):
    """Return an empty dict for a missing config entry, else the entry itself.

    YAML keys given without a value (e.g. 'Linear:' in config.yaml) are parsed
    as None; downstream code expects a dict of options, so map None -> {}.
    The parameter was renamed from 'list' to avoid shadowing the builtin.
    """
    return {} if value is None else value
# Root directory for generated training data and trained models (config.yaml).
DIR = config['data_directory']

# Default target: the trained model file; requesting it triggers the
# full pipeline (generate_data -> train_model).
rule all:
    input:
        DIR+'/model.pt'
# Build the ANN training dataset (paired input/output .npy files) from the
# initial-condition functions listed in config.yaml.
rule generate_data:
    output:
        DIR+'/training_input.npy',
        DIR+'/training_output.npy'
    params:
        left_bound = config['left_boundary'],
        right_bound = config['right_boundary'],
        balance = config['smooth_troubled_balance'],
        stencil_length = config['stencil_length'],
        sample_number = config['sample_number'],
        # expand() flattens the 'functions' mapping into a list of class names
        functions = expand('{FUNCTION}', FUNCTION=config['functions'])
    log:
        DIR+'/log/generate_data.log'
    run:
        # Instantiate each configured initial-condition class by name;
        # keys listed without options come through as None -> empty dict.
        initial_conditions = []
        for function in params.functions:
            initial_conditions.append({
                'function': getattr(Initial_Condition, function)(
                    params.left_bound, params.right_bound, {}),
                'config': replace_none(config['functions'][function])})
        generator = ANN_Data_Generator.TrainingDataGenerator(initial_conditions,
            left_bound=params.left_bound, right_bound=params.right_bound, balance=params.balance,
            stencil_length=params.stencil_length, directory=DIR)
        # Second argument presumably toggles normalization (1 = on) -- TODO confirm
        # against TrainingDataGenerator.build_training_data's signature.
        data = generator.build_training_data(params.sample_number, 1)
        # print(data[0])
# Train the ANN model on the generated dataset and persist the model weights
# and validation-loss history next to the training data.
rule train_model:
    input:
        DIR+'/training_input.npy',
        DIR+'/training_output.npy'
    params:
        num_epochs = config['num_epochs'],
        threshold = config['threshold'],
        batch_size = config['batch_size'],
        model = config['model'],
        # 'model_config' may be empty in config.yaml (parsed as None)
        model_config = replace_none(config['model_config']),
        loss_function = config['loss_function'],
        optimizer = config['optimizer']
    log:
        DIR+'/log/train_model.log'
    output:
        DIR+'/model.pt',
        DIR+'/loss.pt'
    run:
        # Assemble the trainer configuration from the rule parameters.
        # (Fixed PEP8 spacing around '=' from the original.)
        trainer = ANN_Training.ModelTrainer({
            'num_epochs': params.num_epochs, 'data_dir': DIR,
            'model_dir': DIR, 'threshold': params.threshold,
            'batch_size': params.batch_size, 'model': params.model,
            'model_config': params.model_config,
            'loss_function': params.loss_function,
            'optimizer': params.optimizer})
        trainer.epoch_training()
        trainer.save_model()
data_directory: "Snakemake-Test"
# Parameters for Training Data Generation
sample_number: 100
left_boundary: -1
right_boundary: 1
smooth_troubled_balance: 0.5
stencil_length: 3
# Initial Conditions for Training Data
functions:
Sine:
factor: 2
Linear:
Polynomial:
Continuous:
LinearAbsolut:
HeavisideOneSided:
HeavisideTwoSided:
adjustment: 0
# Parameters for Model Training
num_epochs: 1000
threshold: 1.0e-5
batch_size: 500
model: ThreeLayerReLu
model_config:
loss_function: BCELoss
optimizer: Adam
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment