lab/01-Data-Visualization/preprocessing.py [6:135]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
subprocess.check_call([sys.executable, "-m", "pip", "install", "pywavelets==1.1.1"])

import argparse
import os
import warnings

import pandas as pd
import numpy as np
import pywt
import math
import glob
from datetime import datetime

def create_dataset(X, time_steps=1, step=1):
    """
    Encode the timeseries dataset into a
    multidimentional tensor in the format: num_features x step x step.
    It uses a time window approach to slide on 'step' right in the timeseries.    
    """
    Xs = []
    for i in range(0, len(X) - time_steps, step):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
    return np.array(Xs)
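
# Illustrative example (comment only, not executed): for a DataFrame with
# 1,000 rows and 6 feature columns,
#   create_dataset(df, time_steps=100, step=10)
# returns an array of shape (90, 100, 6): 90 overlapping windows, each holding
# 100 consecutive readings of the 6 features.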

def euler_from_quaternion(x, y, z, w):
    """
    Convert a quaternion into euler angles (roll, pitch, yaw)
    roll is rotation around x in radians (counterclockwise)
    pitch is rotation around y in radians (counterclockwise)
    yaw is rotation around z in radians (counterclockwise)
    """
    t0 = +2.0 * (w * x + y * z)
    t1 = +1.0 - 2.0 * (x * x + y * y)
    roll_x = math.atan2(t0, t1)

    t2 = +2.0 * (w * y - z * x)
    t2 = +1.0 if t2 > +1.0 else t2
    t2 = -1.0 if t2 < -1.0 else t2
    pitch_y = math.asin(t2)

    t3 = +2.0 * (w * z + x * y)
    t4 = +1.0 - 2.0 * (y * y + z * z)
    yaw_z = math.atan2(t3, t4)

    return roll_x, pitch_y, yaw_z # in radians
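
# Illustrative examples (comments only, not executed):
#   euler_from_quaternion(0.0, 0.0, 0.0, 1.0)        -> (0.0, 0.0, 0.0)      identity rotation
#   euler_from_quaternion(0.0, 0.0, 0.7071, 0.7071)  -> (0.0, 0.0, ~1.5708)  ~90 degree yaw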

def wavelet_denoise(data, wavelet, noise_sigma):
    '''Filter accelerometer data using wavelet denoising

    Modification of F. Blanco-Silva's code at: https://goo.gl/gOQwy5
    '''
    
    wavelet = pywt.Wavelet(wavelet)
    levels  = min(5, (np.floor(np.log2(data.shape[0]))).astype(int))
    
    # Francisco's code used wavedec2 for image data
    wavelet_coeffs = pywt.wavedec(data, wavelet, level=levels)
    # soft threshold derived from the estimated noise sigma (universal-threshold style, using log2)
    threshold = noise_sigma * np.sqrt(2 * np.log2(data.size))

    # soft-threshold every level of coefficients
    new_wavelet_coeffs = [pywt.threshold(c, threshold, mode='soft') for c in wavelet_coeffs]

    # waverec can return one extra sample when the input length is odd,
    # so trim the reconstruction back to the original length
    return pywt.waverec(new_wavelet_coeffs, wavelet)[:data.shape[0]]
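
# Illustrative example (comment only, not executed): denoising a synthetic signal
#   t = np.linspace(0, 1, 1024)
#   noisy = np.sin(2 * np.pi * 5 * t) + np.random.normal(0, 0.3, t.size)
#   clean = wavelet_denoise(noisy, 'db6', noise_sigma=0.3)
#   clean.shape  # (1024,), same length as the input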

if __name__=='__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--interval', type=int, default=5)
    parser.add_argument('--time-steps', type=int, default=20)
    parser.add_argument('--step', type=int, default=10)
    parser.add_argument('--num-dataset-splits', type=int, default=25)
    args, _ = parser.parse_known_args()

    print('Received arguments {}'.format(args))

    INTERVAL = args.interval # seconds
    TIME_STEPS = args.time_steps * INTERVAL # readings per window = readings per second * window length in seconds
    STEP = args.step

    # features to keep for training
    features = ['roll', 'pitch', 'yaw', 'wind_speed_rps', 'rps', 'voltage']
    input_data_base_path = '/opt/ml/processing/input'
    stats_output_base_path = '/opt/ml/processing/statistics'
    dataset_output_base_path = '/opt/ml/processing/train'

    # load and concatenate the raw data files
    parse_date = lambda date: datetime.strptime(date, '%Y-%m-%dT%H:%M:%S.%f+00:00')
    dfs = []
    for f in glob.glob(os.path.join(input_data_base_path, '*.gz')):
        dfs.append(pd.read_csv(f, compression='gzip', sep=',', low_memory=False, parse_dates=['eventTime'], date_parser=parse_date))
    df = pd.concat(dfs)
    
    print('Converting quaternions (qx, qy, qz, qw) to Euler angles (roll, pitch, yaw)...')
    roll,pitch,yaw = [], [], []
    for idx, row in df.iterrows():
        r,p,y = euler_from_quaternion(row['qx'], row['qy'], row['qz'], row['qw'])
        roll.append(r)
        pitch.append(p)
        yaw.append(y)
    df['roll'] = roll
    df['pitch'] = pitch
    df['yaw'] = yaw

    df = df[features] # keep only the selected features
    
    # get the std for denoising
    raw_std = df.std()
    # denoise
    for f in features:
        df[f] = wavelet_denoise(df[f].values, 'db6', raw_std[f])

    # normalize
    training_std = df.std()
    training_mean = df.mean()
    df = (df - training_mean) / training_std
    
    # export the dataset statistics
    np.save(os.path.join(stats_output_base_path, 'raw_std.npy'), raw_std)
    np.save(os.path.join(stats_output_base_path, 'mean.npy'), training_mean)
    np.save(os.path.join(stats_output_base_path, 'std.npy'), training_std)
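
    # The saved statistics are assumed to be reloaded by downstream training /
    # inference code to undo the normalization, e.g. (illustrative only):
    #   mean = np.load('mean.npy'); std = np.load('std.npy')
    #   original_scale = normalized_values * std + mean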
    
    # format the dataset
    n_cols = len(df.columns)
    X = create_dataset(df, TIME_STEPS, STEP)
    X = np.nan_to_num(X, copy=True, nan=0.0, posinf=None, neginf=None)
    # reorder to (num_windows, num_features, time_steps) and fold each window
    # into a 10x10 grid; this assumes TIME_STEPS == 100 (the default 20 * 5)
    X = np.transpose(X, (0, 2, 1)).reshape(X.shape[0], n_cols, 10, 10)

    # split the array into chunks of at most ~5 MB each
    for i,x in enumerate(np.array_split(X, args.num_dataset_splits)):
        np.save(os.path.join(dataset_output_base_path, 'wind_turbine_%02d.npy' % i), x)
    print("Number of training samples:", len(df))
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



lab/02-Training/preprocessing.py [6:135]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
subprocess.check_call([sys.executable, "-m", "pip", "install", "pywavelets==1.1.1"])

import argparse
import os
import warnings

import pandas as pd
import numpy as np
import pywt
import math
import glob
from datetime import datetime

def create_dataset(X, time_steps=1, step=1):
    """
    Encode the timeseries dataset into a
    multidimentional tensor in the format: num_features x step x step.
    It uses a time window approach to slide on 'step' right in the timeseries.    
    """
    Xs = []
    for i in range(0, len(X) - time_steps, step):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
    return np.array(Xs)
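
# Illustrative example (comment only, not executed): for a DataFrame with
# 1,000 rows and 6 feature columns,
#   create_dataset(df, time_steps=100, step=10)
# returns an array of shape (90, 100, 6): 90 overlapping windows, each holding
# 100 consecutive readings of the 6 features.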

def euler_from_quaternion(x, y, z, w):
    """
    Convert a quaternion into euler angles (roll, pitch, yaw)
    roll is rotation around x in radians (counterclockwise)
    pitch is rotation around y in radians (counterclockwise)
    yaw is rotation around z in radians (counterclockwise)
    """
    t0 = +2.0 * (w * x + y * z)
    t1 = +1.0 - 2.0 * (x * x + y * y)
    roll_x = math.atan2(t0, t1)

    t2 = +2.0 * (w * y - z * x)
    t2 = +1.0 if t2 > +1.0 else t2
    t2 = -1.0 if t2 < -1.0 else t2
    pitch_y = math.asin(t2)

    t3 = +2.0 * (w * z + x * y)
    t4 = +1.0 - 2.0 * (y * y + z * z)
    yaw_z = math.atan2(t3, t4)

    return roll_x, pitch_y, yaw_z # in radians
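
# Illustrative examples (comments only, not executed):
#   euler_from_quaternion(0.0, 0.0, 0.0, 1.0)        -> (0.0, 0.0, 0.0)      identity rotation
#   euler_from_quaternion(0.0, 0.0, 0.7071, 0.7071)  -> (0.0, 0.0, ~1.5708)  ~90 degree yaw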

def wavelet_denoise(data, wavelet, noise_sigma):
    '''Filter accelerometer data using wavelet denoising

    Modification of F. Blanco-Silva's code at: https://goo.gl/gOQwy5
    '''
    
    wavelet = pywt.Wavelet(wavelet)
    levels  = min(5, (np.floor(np.log2(data.shape[0]))).astype(int))
    
    # Francisco's code used wavedec2 for image data
    wavelet_coeffs = pywt.wavedec(data, wavelet, level=levels)
    # soft threshold derived from the estimated noise sigma (universal-threshold style, using log2)
    threshold = noise_sigma * np.sqrt(2 * np.log2(data.size))

    # soft-threshold every level of coefficients
    new_wavelet_coeffs = [pywt.threshold(c, threshold, mode='soft') for c in wavelet_coeffs]

    # waverec can return one extra sample when the input length is odd,
    # so trim the reconstruction back to the original length
    return pywt.waverec(new_wavelet_coeffs, wavelet)[:data.shape[0]]
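
# Illustrative example (comment only, not executed): denoising a synthetic signal
#   t = np.linspace(0, 1, 1024)
#   noisy = np.sin(2 * np.pi * 5 * t) + np.random.normal(0, 0.3, t.size)
#   clean = wavelet_denoise(noisy, 'db6', noise_sigma=0.3)
#   clean.shape  # (1024,), same length as the input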

if __name__=='__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--interval', type=int, default=5)
    parser.add_argument('--time-steps', type=int, default=20)
    parser.add_argument('--step', type=int, default=10)
    parser.add_argument('--num-dataset-splits', type=int, default=25)
    args, _ = parser.parse_known_args()

    print('Received arguments {}'.format(args))

    INTERVAL = args.interval # seconds
    TIME_STEPS = args.time_steps * INTERVAL # readings per window = readings per second * window length in seconds
    STEP = args.step

    # features to keep for training
    features = ['roll', 'pitch', 'yaw', 'wind_speed_rps', 'rps', 'voltage']
    input_data_base_path = '/opt/ml/processing/input'
    stats_output_base_path = '/opt/ml/processing/statistics'
    dataset_output_base_path = '/opt/ml/processing/train'

    # load and concatenate the raw data files
    parse_date = lambda date: datetime.strptime(date, '%Y-%m-%dT%H:%M:%S.%f+00:00')
    dfs = []
    for f in glob.glob(os.path.join(input_data_base_path, '*.gz')):
        dfs.append(pd.read_csv(f, compression='gzip', sep=',', low_memory=False, parse_dates=['eventTime'], date_parser=parse_date))
    df = pd.concat(dfs)
    
    print('Converting quaternions (qx, qy, qz, qw) to Euler angles (roll, pitch, yaw)...')
    roll,pitch,yaw = [], [], []
    for idx, row in df.iterrows():
        r,p,y = euler_from_quaternion(row['qx'], row['qy'], row['qz'], row['qw'])
        roll.append(r)
        pitch.append(p)
        yaw.append(y)
    df['roll'] = roll
    df['pitch'] = pitch
    df['yaw'] = yaw

    df = df[features] # keep only the selected features
    
    # get the std for denoising
    raw_std = df.std()
    # denoise
    for f in features:
        df[f] = wavelet_denoise(df[f].values, 'db6', raw_std[f])

    # normalize
    training_std = df.std()
    training_mean = df.mean()
    df = (df - training_mean) / training_std
    
    # export the dataset statistics
    np.save(os.path.join(stats_output_base_path, 'raw_std.npy'), raw_std)
    np.save(os.path.join(stats_output_base_path, 'mean.npy'), training_mean)
    np.save(os.path.join(stats_output_base_path, 'std.npy'), training_std)
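
    # The saved statistics are assumed to be reloaded by downstream training /
    # inference code to undo the normalization, e.g. (illustrative only):
    #   mean = np.load('mean.npy'); std = np.load('std.npy')
    #   original_scale = normalized_values * std + mean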
    
    # format the dataset
    n_cols = len(df.columns)
    X = create_dataset(df, TIME_STEPS, STEP)
    X = np.nan_to_num(X, copy=True, nan=0.0, posinf=None, neginf=None)
    # reorder to (num_windows, num_features, time_steps) and fold each window
    # into a 10x10 grid; this assumes TIME_STEPS == 100 (the default 20 * 5)
    X = np.transpose(X, (0, 2, 1)).reshape(X.shape[0], n_cols, 10, 10)

    # split the array into chunks of at most ~5 MB each
    for i,x in enumerate(np.array_split(X, args.num_dataset_splits)):
        np.save(os.path.join(dataset_output_base_path, 'wind_turbine_%02d.npy' % i), x)
    print("Number of training samples:", len(df))
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



