"""
Transfer Learning On Two Cases Using WPT
----------------------------------------
This function implements transfer learning by training a classifier on two different
data sets and testing it on remaining two different data sets. Stickout lengths of
each data set should be determined by user. It asks for file paths to training and test set files.
It assumes that reconstructed wavelet packets and frequency domain features are available in the specified data file folder.
The algorithm performs classification with the chosen classifier and returns the results in an array.
It also prints the total elapsed time.
"""
import time
import numpy as np
import scipy.io as sio
from scipy.stats import skew
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from matplotlib import rc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib
import os
# Global plot styling: 14 pt serif (Palatino) text rendered via LaTeX.
# NOTE(review): usetex=True requires a working LaTeX installation on PATH — confirm
# it is available in the target environment before importing this module.
matplotlib.rcParams.update({'font.size': 14})
rc('font',**{'family':'serif','serif':['Palatino']})
rc('text', usetex=True)
#%%
def WPT_Transfer_Learning_2case(stickout_lengths, WPT_Level, Classifier):
    """
    Train a classifier on two cutting-test cases and test it on the other two.

    The first two entries of ``stickout_lengths`` form the training set and
    the remaining two the test set.  The user is prompted for the data folder
    of each case; those folders must already contain the reconstructed wavelet
    packets (``WPT_Level<k>_Recon_...``) and the frequency-domain features
    (``WPT_Level<k>_Freq_Features_...``) computed beforehand.

    :param stickout_lengths:
        Stickout lengths of the four cases, e.g. ['2', '2p5', '3p5', '4p5'].

        * '2'   -> 2 inch
        * '2p5' -> 2.5 inch
        * '3p5' -> 3.5 inch
        * '4p5' -> 4.5 inch
    :param int WPT_Level:
        Level of the Wavelet Packet Decomposition.
    :param str Classifier:
        * 'SVC' -> Support Vector Machine
        * 'LR'  -> Logistic Regression
        * 'RF'  -> Random Forest
        * 'GB'  -> Gradient Boosting
    :Returns:
        :results:
            (np.array) 14x4 array, one row per number of top-ranked features
            used (1..14), values in percent:

            * column 0: mean test-set (transfer) accuracy
            * column 1: test-set accuracy deviation
            * column 2: mean training-set accuracy
            * column 3: training-set accuracy deviation
        :elapsed:
            (str) Total elapsed time message (also printed).
        :featuremat_training, featuremat_test:
            (np.array) Combined time/frequency-domain feature matrices.
    :Example:
        .. doctest::
           >>> from WPT_Transfer_Learning_2case import WPT_Transfer_Learning_2case
           #parameters
           >>> stickout_lengths = ['2','2p5','3p5','4p5']
           >>> WPT_Level=4
           >>> Classifier='SVC'
           >>> results = WPT_Transfer_Learning_2case(stickout_lengths,
           >>>                                       WPT_Level, Classifier)
           Enter the path of first training set data files:
           >>> D\\...\\cutting_tests_processed\\data_2inch_stickout
           Enter the path of second training set data files:
           >>> D\\...\\cutting_tests_processed\\data_2p5inch_stickout
           Enter the path of first test set data files:
           >>> D\\...\\cutting_tests_processed\\data_3p5inch_stickout
           Enter the path of second test set data files:
           >>> D\\...\\cutting_tests_processed\\data_4p5inch_stickout
    """
    # --- get the paths to the data files from the user -------------------
    user_input_train1 = input("Enter the path of first training set data files: ")
    assert os.path.exists(user_input_train1), "Specified file does not exist at, "+str(user_input_train1)
    user_input_train2 = input("Enter the path of second training set data files: ")
    assert os.path.exists(user_input_train2), "Specified file does not exist at, "+str(user_input_train2)
    user_input_test1 = input("Enter the path of first test set data files: ")
    assert os.path.exists(user_input_test1), "Specified file does not exist at, "+str(user_input_test1)
    user_input_test2 = input("Enter the path of second test set data files: ")
    assert os.path.exists(user_input_test2), "Specified file does not exist at, "+str(user_input_test2)

    # order matches stickout_lengths: two training cases then two test cases
    folders = [user_input_train1, user_input_train2,
               user_input_test1, user_input_test2]

    # --- start timer ------------------------------------------------------
    start2 = time.time()

    n_feature = 14  # 10 time-domain features + 4 frequency-domain features

    # --- build the feature matrix and label vector of each case ----------
    label = {}
    feature_mat = {}
    for i, folderToLoad in enumerate(folders):
        # names of the time series belonging to this case
        file_path = os.path.join(folderToLoad,
                                 'time_series_name_' + stickout_lengths[i] + 'inch.txt')
        # use a context manager so the handle is closed (the original leaked it)
        with open(file_path, 'r', newline='\n') as f:
            namets = [line.split("\r\n")[0] for line in f]

        # two-class (chatter / no chatter) labels of this case
        label_path = os.path.join(folderToLoad,
                                  stickout_lengths[i] + '_inch_Labels_2Class.npy')
        label[i] = np.load(label_path)

        # time-domain features computed from the reconstructed wavelet packets
        numberofcase = len(namets)
        feature_mat[i] = np.zeros((numberofcase, 10))
        for j in range(numberofcase):
            nameofdata = 'WPT_Level%s_Recon_%sinch_%s' % (str(WPT_Level), stickout_lengths[i], namets[j])
            pathofdata = os.path.join(folderToLoad, nameofdata)
            # flatten to 1-D so the statistics below are plain scalars
            ts = np.ravel(sio.loadmat(pathofdata)["recon"])
            mean_ts = np.average(ts)
            std_ts = np.std(ts)
            rms = np.sqrt(np.mean(ts ** 2))
            peak = np.max(np.abs(ts))
            L = len(ts)
            feature_mat[i][j, 0] = mean_ts
            feature_mat[i][j, 1] = std_ts
            feature_mat[i][j, 2] = rms
            feature_mat[i][j, 3] = peak
            feature_mat[i][j, 4] = skew(ts)
            # kurtosis with (L-1) normalisation, as in the original formula
            feature_mat[i][j, 5] = np.sum((ts - mean_ts) ** 4) / ((L - 1) * std_ts ** 4)
            feature_mat[i][j, 6] = peak / rms                                   # crest factor
            feature_mat[i][j, 7] = peak / np.average(np.sqrt(np.abs(ts))) ** 2  # clearance factor
            feature_mat[i][j, 8] = rms / np.average(np.abs(ts))                 # shape factor
            feature_mat[i][j, 9] = peak / np.average(np.abs(ts))                # impulse factor

        # append the frequency-domain features precomputed in Matlab
        freq_feature_file_name = 'WPT_Level%d_Freq_Features_%sinch.mat' % (WPT_Level, stickout_lengths[i])
        file_path_Ff = os.path.join(folderToLoad, freq_feature_file_name)
        freq_features = sio.loadmat(file_path_Ff)['Freq_Features']
        feature_mat[i] = np.concatenate((feature_mat[i], freq_features), axis=1)

    # training set = first two cases, test set = remaining two cases
    featuremat_training = np.concatenate((feature_mat[0], feature_mat[1]), axis=0)
    featuremat_test = np.concatenate((feature_mat[2], feature_mat[3]), axis=0)
    label_train = np.concatenate((label[0], label[1]), axis=0)
    label_test = np.concatenate((label[2], label[3]), axis=0)

    # --- accuracy / deviation / timing containers ------------------------
    accuracy1 = np.zeros((n_feature, 10))
    accuracy2 = np.zeros((n_feature, 10))
    deviation1 = np.zeros((n_feature, 1))
    deviation2 = np.zeros((n_feature, 1))
    meanscore1 = np.zeros((n_feature, 1))
    meanscore2 = np.zeros((n_feature, 1))
    duration1 = np.zeros((n_feature, 10))
    meanduration = np.zeros((n_feature, 1))

    # --- repeat the split / rank / classify procedure ten times ----------
    Rank = []
    RankedList = []
    for o in range(10):
        # split both sets; only the train split of the training set and the
        # test split of the test set are actually used below
        F_Training_Train, F_Training_Test, Label_Training_Train, Label_Training_Test = train_test_split(
            featuremat_training, label_train, test_size=0.33)
        F_Test_Train, F_Test_Test, Label_Test_Train, Label_Test_Test = train_test_split(
            featuremat_test, label_test, test_size=0.70)

        # classifier chosen by the user
        if Classifier == 'SVC':
            clf = SVC(kernel='linear')
        elif Classifier == 'LR':
            clf = LogisticRegression()
        elif Classifier == 'RF':
            clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
        elif Classifier == 'GB':
            clf = GradientBoostingClassifier()
        else:
            # the original fell through and crashed later with UnboundLocalError
            raise ValueError("Unknown classifier: %s" % Classifier)

        # recursive feature elimination; keyword argument because the
        # positional n_features_to_select was removed in newer scikit-learn
        selector = RFE(clf, n_features_to_select=1, step=1)
        Label_train = np.ravel(Label_Training_Train)
        Label_test = np.ravel(Label_Test_Test)
        selector = selector.fit(F_Training_Train, Label_train)
        rank = np.asarray(selector.ranking_)
        Rank.append(rank)

        # reorder feature columns by rank.  Read from copies so a source
        # column is never overwritten before it is read (the original's
        # in-place assignment corrupted columns whose source index was
        # smaller than their destination index).
        rankedlist = np.zeros((n_feature, 1))
        F_tr_src = F_Training_Train.copy()
        F_te_src = F_Test_Test.copy()
        for m in range(1, n_feature + 1):
            col = np.where(rank == m)[0][0]
            rankedlist[m - 1] = col
            F_Training_Train[:, m - 1] = F_tr_src[:, col]
            F_Test_Test[:, m - 1] = F_te_src[:, col]
        RankedList.append(rankedlist)

        # train/score using the top-1, top-2, ..., top-14 ranked features
        for p in range(n_feature):
            start1 = time.time()
            clf.fit(F_Training_Train[:, 0:p + 1], Label_train)
            accuracy1[p, o] = clf.score(F_Test_Test[:, 0:p + 1], Label_test)        # transfer (test-set) accuracy
            accuracy2[p, o] = clf.score(F_Training_Train[:, 0:p + 1], Label_train)  # training accuracy
            duration1[p, o] = time.time() - start1

    # --- mean score and deviation over the ten repetitions ---------------
    for n in range(n_feature):
        deviation1[n, 0] = np.std(accuracy1[n, :])
        deviation2[n, 0] = np.std(accuracy2[n, :])
        meanscore1[n, 0] = np.mean(accuracy1[n, :])
        meanscore2[n, 0] = np.mean(accuracy2[n, :])
        meanduration[n, 0] = np.mean(duration1[n, :])

    results = 100 * np.concatenate((meanscore1, deviation1, meanscore2, deviation2), axis=1)

    # --- total duration for the algorithm --------------------------------
    duration2 = time.time() - start2
    elapsed = 'Total elapsed time: {}'.format(duration2)
    print(elapsed)
    # NOTE: the original returned print(...)'s value (always None) as the
    # second element; the elapsed-time string is returned instead, matching
    # the documented contract.
    return results, elapsed, featuremat_training, featuremat_test