import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# from sklearn.feature_selection import f_regression
index_names = ['engine_num', 'time_cycles']
setting_names = ['set_1', 'set_2', 'set_3']
sensor_names = ['s_{}'.format(i) for i in range(1, 25)] # column headings; the names beyond s_21 catch the all-NaN columns produced by trailing spaces in the raw file
col_names = index_names + setting_names + sensor_names
df = pd.read_csv('train_FD001.txt', sep=' ', names=col_names, index_col=0)
df.dropna(how='all', axis=1, inplace=True) # condition the dataframe by dropping the all-null columns
df
time_cycles | set_1 | set_2 | set_3 | s_1 | s_2 | s_3 | s_4 | s_5 | s_6 | ... | s_12 | s_13 | s_14 | s_15 | s_16 | s_17 | s_18 | s_19 | s_20 | s_21 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
engine_num | |||||||||||||||||||||
1 | 1 | -0.0007 | -0.0004 | 100.0 | 518.67 | 641.82 | 1589.70 | 1400.60 | 14.62 | 21.61 | ... | 521.66 | 2388.02 | 8138.62 | 8.4195 | 0.03 | 392 | 2388 | 100.0 | 39.06 | 23.4190 |
1 | 2 | 0.0019 | -0.0003 | 100.0 | 518.67 | 642.15 | 1591.82 | 1403.14 | 14.62 | 21.61 | ... | 522.28 | 2388.07 | 8131.49 | 8.4318 | 0.03 | 392 | 2388 | 100.0 | 39.00 | 23.4236 |
1 | 3 | -0.0043 | 0.0003 | 100.0 | 518.67 | 642.35 | 1587.99 | 1404.20 | 14.62 | 21.61 | ... | 522.42 | 2388.03 | 8133.23 | 8.4178 | 0.03 | 390 | 2388 | 100.0 | 38.95 | 23.3442 |
1 | 4 | 0.0007 | 0.0000 | 100.0 | 518.67 | 642.35 | 1582.79 | 1401.87 | 14.62 | 21.61 | ... | 522.86 | 2388.08 | 8133.83 | 8.3682 | 0.03 | 392 | 2388 | 100.0 | 38.88 | 23.3739 |
1 | 5 | -0.0019 | -0.0002 | 100.0 | 518.67 | 642.37 | 1582.85 | 1406.22 | 14.62 | 21.61 | ... | 522.19 | 2388.04 | 8133.80 | 8.4294 | 0.03 | 393 | 2388 | 100.0 | 38.90 | 23.4044 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
100 | 196 | -0.0004 | -0.0003 | 100.0 | 518.67 | 643.49 | 1597.98 | 1428.63 | 14.62 | 21.61 | ... | 519.49 | 2388.26 | 8137.60 | 8.4956 | 0.03 | 397 | 2388 | 100.0 | 38.49 | 22.9735 |
100 | 197 | -0.0016 | -0.0005 | 100.0 | 518.67 | 643.54 | 1604.50 | 1433.58 | 14.62 | 21.61 | ... | 519.68 | 2388.22 | 8136.50 | 8.5139 | 0.03 | 395 | 2388 | 100.0 | 38.30 | 23.1594 |
100 | 198 | 0.0004 | 0.0000 | 100.0 | 518.67 | 643.42 | 1602.46 | 1428.18 | 14.62 | 21.61 | ... | 520.01 | 2388.24 | 8141.05 | 8.5646 | 0.03 | 398 | 2388 | 100.0 | 38.44 | 22.9333 |
100 | 199 | -0.0011 | 0.0003 | 100.0 | 518.67 | 643.23 | 1605.26 | 1426.53 | 14.62 | 21.61 | ... | 519.67 | 2388.23 | 8139.29 | 8.5389 | 0.03 | 395 | 2388 | 100.0 | 38.29 | 23.0640 |
100 | 200 | -0.0032 | -0.0005 | 100.0 | 518.67 | 643.85 | 1600.38 | 1432.14 | 14.62 | 21.61 | ... | 519.30 | 2388.26 | 8137.33 | 8.5036 | 0.03 | 396 | 2388 | 100.0 | 38.37 | 23.0522 |
20631 rows × 25 columns
# initial exploration of the dataset to check for null values
print(df.shape)
df.isnull().sum()
(20631, 25)
time_cycles    0
set_1          0
set_2          0
set_3          0
s_1            0
s_2            0
s_3            0
s_4            0
s_5            0
s_6            0
s_7            0
s_8            0
s_9            0
s_10           0
s_11           0
s_12           0
s_13           0
s_14           0
s_15           0
s_16           0
s_17           0
s_18           0
s_19           0
s_20           0
s_21           0
dtype: int64
All values in s_22 and s_23 are null (artifacts of trailing whitespace in the raw file), so those columns were removed by the dropna call above; the remaining columns contain no nulls.
print(df.shape)
(20631, 25)
Computing the life ratio (LR) and remaining useful life (RUL) targets for prediction
EOL = []
for i in df.index:
    EOL.append(df.loc[i]['time_cycles'].max())  # max cycle count of the engine this row belongs to
df['EOL'] = EOL
df['LR'] = df['time_cycles']/df['EOL']   # life ratio: fraction of total life consumed so far
df['RUL'] = df['EOL'] - df['time_cycles']  # remaining useful life in cycles
df.drop('EOL', axis=1, inplace=True)
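Because df.index repeats each engine number once per row, the loop above recomputes the per-engine maximum for every single row. An equivalent vectorized sketch, assuming the engine_num index set above:

# equivalent, vectorized: broadcast each engine's max cycle count back to its rows
eol = df.groupby(level='engine_num')['time_cycles'].transform('max')
df['LR'] = df['time_cycles'] / eol
df['RUL'] = eol - df['time_cycles']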
df
time_cycles | set_1 | set_2 | set_3 | s_1 | s_2 | s_3 | s_4 | s_5 | s_6 | ... | s_14 | s_15 | s_16 | s_17 | s_18 | s_19 | s_20 | s_21 | LR | RUL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
engine_num | |||||||||||||||||||||
1 | 1 | -0.0007 | -0.0004 | 100.0 | 518.67 | 641.82 | 1589.70 | 1400.60 | 14.62 | 21.61 | ... | 8138.62 | 8.4195 | 0.03 | 392 | 2388 | 100.0 | 39.06 | 23.4190 | 0.005208 | 191 |
1 | 2 | 0.0019 | -0.0003 | 100.0 | 518.67 | 642.15 | 1591.82 | 1403.14 | 14.62 | 21.61 | ... | 8131.49 | 8.4318 | 0.03 | 392 | 2388 | 100.0 | 39.00 | 23.4236 | 0.010417 | 190 |
1 | 3 | -0.0043 | 0.0003 | 100.0 | 518.67 | 642.35 | 1587.99 | 1404.20 | 14.62 | 21.61 | ... | 8133.23 | 8.4178 | 0.03 | 390 | 2388 | 100.0 | 38.95 | 23.3442 | 0.015625 | 189 |
1 | 4 | 0.0007 | 0.0000 | 100.0 | 518.67 | 642.35 | 1582.79 | 1401.87 | 14.62 | 21.61 | ... | 8133.83 | 8.3682 | 0.03 | 392 | 2388 | 100.0 | 38.88 | 23.3739 | 0.020833 | 188 |
1 | 5 | -0.0019 | -0.0002 | 100.0 | 518.67 | 642.37 | 1582.85 | 1406.22 | 14.62 | 21.61 | ... | 8133.80 | 8.4294 | 0.03 | 393 | 2388 | 100.0 | 38.90 | 23.4044 | 0.026042 | 187 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
100 | 196 | -0.0004 | -0.0003 | 100.0 | 518.67 | 643.49 | 1597.98 | 1428.63 | 14.62 | 21.61 | ... | 8137.60 | 8.4956 | 0.03 | 397 | 2388 | 100.0 | 38.49 | 22.9735 | 0.980000 | 4 |
100 | 197 | -0.0016 | -0.0005 | 100.0 | 518.67 | 643.54 | 1604.50 | 1433.58 | 14.62 | 21.61 | ... | 8136.50 | 8.5139 | 0.03 | 395 | 2388 | 100.0 | 38.30 | 23.1594 | 0.985000 | 3 |
100 | 198 | 0.0004 | 0.0000 | 100.0 | 518.67 | 643.42 | 1602.46 | 1428.18 | 14.62 | 21.61 | ... | 8141.05 | 8.5646 | 0.03 | 398 | 2388 | 100.0 | 38.44 | 22.9333 | 0.990000 | 2 |
100 | 199 | -0.0011 | 0.0003 | 100.0 | 518.67 | 643.23 | 1605.26 | 1426.53 | 14.62 | 21.61 | ... | 8139.29 | 8.5389 | 0.03 | 395 | 2388 | 100.0 | 38.29 | 23.0640 | 0.995000 | 1 |
100 | 200 | -0.0032 | -0.0005 | 100.0 | 518.67 | 643.85 | 1600.38 | 1432.14 | 14.62 | 21.61 | ... | 8137.33 | 8.5036 | 0.03 | 396 | 2388 | 100.0 | 38.37 | 23.0522 | 1.000000 | 0 |
20631 rows × 27 columns
EDA to visualize the variable distributions
# fig, ax = plt.subplots(nrows=df.shape[1], ncols=1, figsize=(25,45))
# for i in range(0, len(df.columns)):
#     sns.lineplot(data=df, x=df['LR'], y=df[df.columns[i]], ax=ax[i])
# fig.savefig('DF1_variables.png', format='png')
plt.figure(figsize=(25,20))
corr = df.corr()  # compute the matrix once instead of twice
sns.heatmap(corr, annot=True, mask=np.triu(corr))
<AxesSubplot:>
Eliminating variables that do not vary with time
# considering all the engines
stds = df.describe().loc['std']
static_var = stds[stds < 0.00001]
static_var
set_3    0.000000e+00
s_1      6.537152e-11
s_5      3.394700e-12
s_10     4.660829e-13
s_16     1.556432e-14
s_18     0.000000e+00
s_19     0.000000e+00
Name: std, dtype: float64
df = df.drop(static_var.index,axis=1)
df
time_cycles | set_1 | set_2 | s_2 | s_3 | s_4 | s_6 | s_7 | s_8 | s_9 | s_11 | s_12 | s_13 | s_14 | s_15 | s_17 | s_20 | s_21 | LR | RUL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
engine_num | ||||||||||||||||||||
1 | 1 | -0.0007 | -0.0004 | 641.82 | 1589.70 | 1400.60 | 21.61 | 554.36 | 2388.06 | 9046.19 | 47.47 | 521.66 | 2388.02 | 8138.62 | 8.4195 | 392 | 39.06 | 23.4190 | 0.005208 | 191 |
1 | 2 | 0.0019 | -0.0003 | 642.15 | 1591.82 | 1403.14 | 21.61 | 553.75 | 2388.04 | 9044.07 | 47.49 | 522.28 | 2388.07 | 8131.49 | 8.4318 | 392 | 39.00 | 23.4236 | 0.010417 | 190 |
1 | 3 | -0.0043 | 0.0003 | 642.35 | 1587.99 | 1404.20 | 21.61 | 554.26 | 2388.08 | 9052.94 | 47.27 | 522.42 | 2388.03 | 8133.23 | 8.4178 | 390 | 38.95 | 23.3442 | 0.015625 | 189 |
1 | 4 | 0.0007 | 0.0000 | 642.35 | 1582.79 | 1401.87 | 21.61 | 554.45 | 2388.11 | 9049.48 | 47.13 | 522.86 | 2388.08 | 8133.83 | 8.3682 | 392 | 38.88 | 23.3739 | 0.020833 | 188 |
1 | 5 | -0.0019 | -0.0002 | 642.37 | 1582.85 | 1406.22 | 21.61 | 554.00 | 2388.06 | 9055.15 | 47.28 | 522.19 | 2388.04 | 8133.80 | 8.4294 | 393 | 38.90 | 23.4044 | 0.026042 | 187 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
100 | 196 | -0.0004 | -0.0003 | 643.49 | 1597.98 | 1428.63 | 21.61 | 551.43 | 2388.19 | 9065.52 | 48.07 | 519.49 | 2388.26 | 8137.60 | 8.4956 | 397 | 38.49 | 22.9735 | 0.980000 | 4 |
100 | 197 | -0.0016 | -0.0005 | 643.54 | 1604.50 | 1433.58 | 21.61 | 550.86 | 2388.23 | 9065.11 | 48.04 | 519.68 | 2388.22 | 8136.50 | 8.5139 | 395 | 38.30 | 23.1594 | 0.985000 | 3 |
100 | 198 | 0.0004 | 0.0000 | 643.42 | 1602.46 | 1428.18 | 21.61 | 550.94 | 2388.24 | 9065.90 | 48.09 | 520.01 | 2388.24 | 8141.05 | 8.5646 | 398 | 38.44 | 22.9333 | 0.990000 | 2 |
100 | 199 | -0.0011 | 0.0003 | 643.23 | 1605.26 | 1426.53 | 21.61 | 550.68 | 2388.25 | 9073.72 | 48.39 | 519.67 | 2388.23 | 8139.29 | 8.5389 | 395 | 38.29 | 23.0640 | 0.995000 | 1 |
100 | 200 | -0.0032 | -0.0005 | 643.85 | 1600.38 | 1432.14 | 21.61 | 550.79 | 2388.26 | 9061.48 | 48.20 | 519.30 | 2388.26 | 8137.33 | 8.5036 | 396 | 38.37 | 23.0522 | 1.000000 | 0 |
20631 rows × 20 columns
plt.figure(figsize=(25,20))
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', mask=np.triu(corr))
plt.title('correlation for the complete dataset')
Text(0.5, 1.0, 'correlation for the complete dataset')
def plot_corr(df, engine_num):
    tdf = df.loc[engine_num]  # rows for the selected engine
    corr = tdf.corr()         # compute the matrix once
    fig = plt.figure(figsize=(25,20))
    sns.heatmap(corr, annot=True, cmap='coolwarm', mask=np.triu(corr))
    plt.title('correlation matrix for engine number: {}'.format(engine_num))

plot_corr(df, 75)
Finding the columns whose per-engine correlations are all null
col_list = []
eng_list = []
for i in df.index.unique():
    corr = df.loc[i].corr()  # per-engine correlation matrix, computed once per engine
    for j in corr.columns:
        # a column of all-NaN correlations means the sensor is constant for this engine
        if corr[j].isnull().sum() == corr.shape[0]:
            col_list.append(j)
            eng_list.append(i)
a = pd.DataFrame({'Engine list': eng_list, 'column': col_list})
a = a.set_index('Engine list')
a
column | |
---|---|
Engine list | |
1 | s_6 |
6 | s_6 |
8 | s_6 |
12 | s_6 |
13 | s_6 |
14 | s_6 |
16 | s_6 |
19 | s_6 |
20 | s_6 |
22 | s_6 |
23 | s_6 |
25 | s_6 |
26 | s_6 |
29 | s_6 |
32 | s_6 |
35 | s_6 |
36 | s_6 |
37 | s_6 |
38 | s_6 |
39 | s_6 |
40 | s_6 |
49 | s_6 |
55 | s_6 |
56 | s_6 |
58 | s_6 |
60 | s_6 |
62 | s_6 |
63 | s_6 |
70 | s_6 |
71 | s_6 |
73 | s_6 |
75 | s_6 |
76 | s_6 |
80 | s_6 |
84 | s_6 |
87 | s_6 |
91 | s_6 |
94 | s_6 |
Sensor 6 shows all-null correlations for many engines, i.e., it is constant over those engines' lifetimes, so it is dropped.
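As a cross-check, here is a minimal sketch of the same diagnosis: a sensor whose per-engine correlations are all NaN is simply constant within those engines, which nunique exposes directly (this uses the engine_num index set earlier).

# engines for which s_6 never changes value (nunique == 1 means constant)
s6_uniques = df.groupby(level='engine_num')['s_6'].nunique()
print(s6_uniques[s6_uniques == 1].index.tolist())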
df = df.drop('s_6', axis=1)
df
time_cycles | set_1 | set_2 | s_2 | s_3 | s_4 | s_7 | s_8 | s_9 | s_11 | s_12 | s_13 | s_14 | s_15 | s_17 | s_20 | s_21 | LR | RUL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
engine_num | |||||||||||||||||||
1 | 1 | -0.0007 | -0.0004 | 641.82 | 1589.70 | 1400.60 | 554.36 | 2388.06 | 9046.19 | 47.47 | 521.66 | 2388.02 | 8138.62 | 8.4195 | 392 | 39.06 | 23.4190 | 0.005208 | 191 |
1 | 2 | 0.0019 | -0.0003 | 642.15 | 1591.82 | 1403.14 | 553.75 | 2388.04 | 9044.07 | 47.49 | 522.28 | 2388.07 | 8131.49 | 8.4318 | 392 | 39.00 | 23.4236 | 0.010417 | 190 |
1 | 3 | -0.0043 | 0.0003 | 642.35 | 1587.99 | 1404.20 | 554.26 | 2388.08 | 9052.94 | 47.27 | 522.42 | 2388.03 | 8133.23 | 8.4178 | 390 | 38.95 | 23.3442 | 0.015625 | 189 |
1 | 4 | 0.0007 | 0.0000 | 642.35 | 1582.79 | 1401.87 | 554.45 | 2388.11 | 9049.48 | 47.13 | 522.86 | 2388.08 | 8133.83 | 8.3682 | 392 | 38.88 | 23.3739 | 0.020833 | 188 |
1 | 5 | -0.0019 | -0.0002 | 642.37 | 1582.85 | 1406.22 | 554.00 | 2388.06 | 9055.15 | 47.28 | 522.19 | 2388.04 | 8133.80 | 8.4294 | 393 | 38.90 | 23.4044 | 0.026042 | 187 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
100 | 196 | -0.0004 | -0.0003 | 643.49 | 1597.98 | 1428.63 | 551.43 | 2388.19 | 9065.52 | 48.07 | 519.49 | 2388.26 | 8137.60 | 8.4956 | 397 | 38.49 | 22.9735 | 0.980000 | 4 |
100 | 197 | -0.0016 | -0.0005 | 643.54 | 1604.50 | 1433.58 | 550.86 | 2388.23 | 9065.11 | 48.04 | 519.68 | 2388.22 | 8136.50 | 8.5139 | 395 | 38.30 | 23.1594 | 0.985000 | 3 |
100 | 198 | 0.0004 | 0.0000 | 643.42 | 1602.46 | 1428.18 | 550.94 | 2388.24 | 9065.90 | 48.09 | 520.01 | 2388.24 | 8141.05 | 8.5646 | 398 | 38.44 | 22.9333 | 0.990000 | 2 |
100 | 199 | -0.0011 | 0.0003 | 643.23 | 1605.26 | 1426.53 | 550.68 | 2388.25 | 9073.72 | 48.39 | 519.67 | 2388.23 | 8139.29 | 8.5389 | 395 | 38.29 | 23.0640 | 0.995000 | 1 |
100 | 200 | -0.0032 | -0.0005 | 643.85 | 1600.38 | 1432.14 | 550.79 | 2388.26 | 9061.48 | 48.20 | 519.30 | 2388.26 | 8137.33 | 8.5036 | 396 | 38.37 | 23.0522 | 1.000000 | 0 |
20631 rows × 19 columns
def find_corr_pairs(corr, thrsh):
    """
    Find highly correlated column pairs in a correlation matrix.
    ============================================================
    input:
        corr  - (DataFrame) correlation matrix generated by pandas
        thrsh - (float) absolute-correlation threshold above which a pair is reported
    output:
        high_corr_pairs - (list) tuples of (column 1, column 2, correlation) with |corr| > thrsh
    """
    high_corr_pairs = []
    # same as the input 'corr' but with the upper triangle zeroed (for convenience only)
    corr_diag = pd.DataFrame(np.tril(corr.values), columns=corr.columns, index=corr.index)
    # check the correlation between every pair of columns and keep the high ones
    for col_num, col in enumerate(corr_diag):
        col_corr = corr_diag[col].iloc[col_num+1:]  # this slicing skips self-correlation and symmetric duplicates
        # boolean mask for pairs highly correlated with col
        mask_pairs = col_corr.abs() > thrsh
        idx_pairs = col_corr[mask_pairs].index
        # collect the high-correlation pairs
        for idx, c in zip(idx_pairs, col_corr[mask_pairs].values):
            high_corr_pairs.append((col, idx, c))
    return high_corr_pairs
engine_num = 5
corr_all = df.drop('time_cycles', axis=1).corr()  # linear correlation between variables across all engines
corr_num = df.loc[engine_num].drop('time_cycles', axis=1).corr()  # linear correlation between variables for engine [engine_num]
corr_pairs = find_corr_pairs(corr_all, 0.9)
for c in corr_pairs:
    print(c)
('s_9', 's_14', 0.9631566003059776)
('LR', 'RUL', -0.9084599112112265)
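A more compact alternative to the function above (a sketch, not the notebook's own implementation): mask the upper triangle and stack the matrix into a (column 1, column 2) -> correlation Series, then filter by absolute value.

# keep the strict lower triangle, stack into pairs, filter by |corr| > 0.9
lower = corr_all.where(np.tril(np.ones(corr_all.shape, dtype=bool), k=-1))
pairs = lower.stack()
print(pairs[pairs.abs() > 0.9])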
print(df.columns)
df = df.drop('s_14',axis=1)
print(df.columns)
Index(['time_cycles', 'set_1', 'set_2', 's_2', 's_3', 's_4', 's_7', 's_8', 's_9', 's_11', 's_12', 's_13', 's_14', 's_15', 's_17', 's_20', 's_21', 'LR', 'RUL'], dtype='object')
Index(['time_cycles', 'set_1', 'set_2', 's_2', 's_3', 's_4', 's_7', 's_8', 's_9', 's_11', 's_12', 's_13', 's_15', 's_17', 's_20', 's_21', 'LR', 'RUL'], dtype='object')
def plot_ts_all(df):
    # prepare the dataframe for plotting
    ts = df.copy()  # work on a copy so the original frame is untouched
    ts.drop(labels=['time_cycles'], axis=1, inplace=True)
    cols = ts.columns
    # plotting
    fig, axes = plt.subplots(len(cols)-2, 1, figsize=(25,20))
    for col, ax in zip(cols, axes):
        if col == 'time_cycles':
            continue
        fontdict = {'fontsize': 14}
        ax.set_title(col, loc='left', fontdict=fontdict)
        for engine_id in ts.index.unique():
            time = df.loc[engine_id, 'time_cycles']
            ax.plot(time, ts.loc[engine_id, col], label=col)
    # figure title
    fig.suptitle('All Engines Time Series \n each line is a different engine response \n x-axis is the number of cycles, with the rightmost point being the last cycle for each engine')
    return fig
# fig = plot_ts_all(df)
# plt.tight_layout()
# the operational settings show no clear trend over the engine life cycle, so they are dropped
df = df.drop(['set_1','set_2'], axis=1)
df
time_cycles | s_2 | s_3 | s_4 | s_7 | s_8 | s_9 | s_11 | s_12 | s_13 | s_15 | s_17 | s_20 | s_21 | LR | RUL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
engine_num | ||||||||||||||||
1 | 1 | 641.82 | 1589.70 | 1400.60 | 554.36 | 2388.06 | 9046.19 | 47.47 | 521.66 | 2388.02 | 8.4195 | 392 | 39.06 | 23.4190 | 0.005208 | 191 |
1 | 2 | 642.15 | 1591.82 | 1403.14 | 553.75 | 2388.04 | 9044.07 | 47.49 | 522.28 | 2388.07 | 8.4318 | 392 | 39.00 | 23.4236 | 0.010417 | 190 |
1 | 3 | 642.35 | 1587.99 | 1404.20 | 554.26 | 2388.08 | 9052.94 | 47.27 | 522.42 | 2388.03 | 8.4178 | 390 | 38.95 | 23.3442 | 0.015625 | 189 |
1 | 4 | 642.35 | 1582.79 | 1401.87 | 554.45 | 2388.11 | 9049.48 | 47.13 | 522.86 | 2388.08 | 8.3682 | 392 | 38.88 | 23.3739 | 0.020833 | 188 |
1 | 5 | 642.37 | 1582.85 | 1406.22 | 554.00 | 2388.06 | 9055.15 | 47.28 | 522.19 | 2388.04 | 8.4294 | 393 | 38.90 | 23.4044 | 0.026042 | 187 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
100 | 196 | 643.49 | 1597.98 | 1428.63 | 551.43 | 2388.19 | 9065.52 | 48.07 | 519.49 | 2388.26 | 8.4956 | 397 | 38.49 | 22.9735 | 0.980000 | 4 |
100 | 197 | 643.54 | 1604.50 | 1433.58 | 550.86 | 2388.23 | 9065.11 | 48.04 | 519.68 | 2388.22 | 8.5139 | 395 | 38.30 | 23.1594 | 0.985000 | 3 |
100 | 198 | 643.42 | 1602.46 | 1428.18 | 550.94 | 2388.24 | 9065.90 | 48.09 | 520.01 | 2388.24 | 8.5646 | 398 | 38.44 | 22.9333 | 0.990000 | 2 |
100 | 199 | 643.23 | 1605.26 | 1426.53 | 550.68 | 2388.25 | 9073.72 | 48.39 | 519.67 | 2388.23 | 8.5389 | 395 | 38.29 | 23.0640 | 0.995000 | 1 |
100 | 200 | 643.85 | 1600.38 | 1432.14 | 550.79 | 2388.26 | 9061.48 | 48.20 | 519.30 | 2388.26 | 8.5036 | 396 | 38.37 | 23.0522 | 1.000000 | 0 |
20631 rows × 16 columns
RUL prediction can be framed in several ways, either as a classification problem or as a regression problem:

Classification: using LR (i.e., the ratio of the present cycle to that engine's maximum cycle), the life cycle of each engine can be divided into 3 regions: 0 to 60% LR -> good condition, 60% to 80% LR -> moderate condition, and 80% to 100% LR -> terminal condition/faulty engine (see the labeling sketch below).

Regression: use either LR or RUL as the target variable, with the remaining variables as features.
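A minimal sketch of the classification labeling described above (the bin edges follow the stated percentages; the label names are illustrative only and nothing below depends on them):

# hypothetical 3-class condition label derived from the life ratio (LR)
condition = pd.cut(df['LR'], bins=[0.0, 0.6, 0.8, 1.0],
                   labels=['good', 'moderate', 'terminal'],
                   include_lowest=True)
print(condition.value_counts())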
# get all sensor columns (everything between time_cycles and LR/RUL)
raw_columns = df.columns[1:-2]
raw_sensors = df[raw_columns]  # still a DataFrame; becomes a numpy array after scaling below
standard_scale = StandardScaler()
standard_sensors = standard_scale.fit_transform(raw_sensors)
lin_model = LinearRegression()
engine_num = 3
x = df.loc[engine_num, 'RUL'].values
row_sl = df.index.get_loc(engine_num)  # row slice to get the numpy index
row_sl
slice(479, 658, None)
y = standard_sensors[row_sl]  # sensor values for the specific engine
y
array([[-1.2817616 , -1.03133545, -1.20036862, ..., -1.42740201, 0.79521805, 0.28448163], [-0.04186365, -0.56811582, -1.14592651, ..., -2.07309423, 1.40382021, 1.35978769], [-1.22176654, -1.59404944, -0.8703828 , ..., -0.13601757, 1.18251034, 0.24475813], ..., [ 1.99796847, 1.74308918, 1.61839917, ..., 2.44675132, -2.46910261, -2.36590761], [ 1.95797176, 2.6059173 , 2.40169886, ..., 3.73813576, -1.36255323, -1.45411458], [ 1.65799645, 2.32863794, 2.14393215, ..., 3.09244354, -2.3031202 , -3.0809307 ]])
lin_model.fit(x.reshape(-1, 1),y)
LinearRegression()
lin_model.coef_[:,0].shape
(13,)
lin_model.score(x.reshape(-1, 1),y)
0.5027699485857744
y_hat = lin_model.predict(x.reshape(-1, 1))
# plotting
time = df.loc[engine_num,'time_cycles']
cols = df.columns[1:-2]
fig, axes = plt.subplots(len(cols), 1, figsize=(25,20))
for col, ax in zip(range(standard_sensors.shape[1]), axes):
    ax.plot(time, standard_sensors[row_sl, col], label=cols[col])  # cols holds the sensor names; df.columns[col] would be off by one
    ax.plot(time, y_hat[:, col], label='trend')
    ax.legend(loc=2)
def lin_slopes(sensors, df, engine_num):
    """
    Gives the slope of the linear trend line for each sensor of one engine.
    =======================================================================
    input:
        sensors    - (ndarray) standardized signals (rows: cycles, columns: sensors)
        df         - (DataFrame) data frame of all engines
        engine_num - (int) engine number to select
    output:
        slopes - (ndarray) slope of each signal's linear trend line
    """
    model = LinearRegression()
    x = df.loc[engine_num, 'RUL'].values
    row_sl = df.index.get_loc(engine_num)  # row slice to get the numpy index
    y = sensors[row_sl]  # sensor values for the specific engine
    model.fit(x.reshape(-1, 1), y)
    slopes = model.coef_[:, 0]
    return slopes
model = LinearRegression()
model.fit(x.reshape(-1, 1),y)
model.coef_[:,0]
array([-0.01213049, -0.01402176, -0.01457614, 0.01150702, -0.00566421, -0.02590653, -0.01410581, 0.0115465 , -0.00522121, -0.01418821, -0.01370743, 0.01268412, 0.01317799])
# finding slopes for all engines
engines = df.index.unique().values
slopes = np.empty((standard_sensors.shape[1], len(engines)))
for i, engine in enumerate(engines):
    slopes[:, i] = lin_slopes(standard_sensors, df, engine)
# creating slopes_df
slopes_df = pd.DataFrame(slopes.T, index=engines, columns=raw_columns)
# index of highest to lowest abs(slope) for each signal
slope_order_idx = np.argsort(np.abs(slopes.mean(axis=1)))[::-1]
slope_order_idx
array([ 6, 2, 7, 3, 9, 12, 11, 10, 0, 1, 8, 4, 5], dtype=int64)
pca = PCA()
pca.fit(standard_sensors)
PCA()
100*pca.explained_variance_ratio_
array([68.92499095, 9.10984975, 3.17575244, 2.82500121, 2.6682035 , 2.35450554, 2.22533398, 1.9105238 , 1.57960165, 1.42257128, 1.35704776, 1.32673415, 1.11988399])
pd.Series(pca.explained_variance_ratio_.cumsum()).plot(figsize=(12,8))
<AxesSubplot:>
fig, ax1 = plt.subplots(figsize = (12,6))
ax2 = ax1.twinx()
# generate the plots
ax1.plot(pca.explained_variance_ratio_, '--*', ms=8);
ax2.plot(pca.explained_variance_ratio_.cumsum(), '--go');
#setup axis titles
ax1.set_xlabel('Principal Component', fontsize=14)
ax1.set_ylabel('Variance Ratio', color= 'b', fontsize=14)
ax2.set_ylabel('Cumulative Variance Ratio', color='g', fontsize=14)
plt.grid()
plt.title('PCA-All features',fontsize=14,fontweight='bold')
plt.xlim(0,pca.n_components_-1)
plt.show()
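From the cumulative curve, one can read off how many components are needed for a given variance budget. A minimal sketch, assuming a (arbitrary) 95% threshold:

# smallest number of components explaining at least 95% of the variance
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_components_95 = int(np.searchsorted(cum_var, 0.95)) + 1
print(n_components_95)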
PCA with the 6 most influential sensors
num_high_slopes = 6
pca_high_n_components=5
sensors_high_trend=standard_sensors[:,slope_order_idx[0:num_high_slopes]]
pca_high = PCA(pca_high_n_components,whiten=True)
pca_high.fit(sensors_high_trend)
PCA(n_components=5, whiten=True)
pca_high.explained_variance_ratio_
array([0.81660323, 0.04968993, 0.0442961 , 0.03439886, 0.03014873])
sensors_pca=pca_high.transform(sensors_high_trend)
sensors_pca.shape
(20631, 5)
# create a dictionary of engine slices
engines = df.index.unique()  # engine numbers
engine_slices = dict()  # key: engine number, value: slice giving the numpy index of that engine's rows
for engine_num in engines:
    engine_slices[engine_num] = df.index.get_loc(engine_num)  # row slice to get the numpy index
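As a quick sanity check, the slice stored for engine 3 should match the row_sl computed earlier:

print(engine_slices[3])  # expected: slice(479, 658, None), as obtained above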
# ax = plt.subplot(figsize=(15,12))
fig=plt.figure(figsize=(15,12))
ax=sns.histplot(df['RUL'],kde=True)
ax.set_title('Distribution of RUL for all engines',{'fontsize':16});
ax.set_xlabel('RUL');
# conditions and thresholds for high HI and low HI
RUL_high = 300 # engines with at least this many cycles remaining are considered at near-perfect health
RUL_low = 5 # engines with at most this many cycles remaining are considered failed (for purposes of modeling and getting data)
RUL_df = df['RUL'].values
# Gather data and prepare it for HI fusing and modeling
# healthy rows come from early cycles (high RUL); failed rows come from the final cycles (low RUL)
idx_high_HI = RUL_df >= RUL_high
idx_low_HI = RUL_df <= RUL_low
# data used to fit the sensor-fusion model (HI creation)
high_HI_data = sensors_pca[idx_high_HI, :]
low_HI_data = sensors_pca[idx_low_HI, :]
# concatenate high-HI and low-HI data
X_HI = np.concatenate((high_HI_data, low_HI_data), axis=0)
# target for the fused signal (1 for healthy, 0 for failed)
y_one = np.ones(high_HI_data.shape[0])
y_zero = np.zeros(low_HI_data.shape[0])
# concatenate high-HI and low-HI targets
y_HI = np.concatenate((y_one, y_zero), axis=0)
# fit a model to get fused sensor
# linear regression
HI_linear = LinearRegression()
HI_linear.fit(X_HI,y_HI)
# logistic regression
HI_logistic = LogisticRegression(solver='liblinear')
HI_logistic.fit(X_HI,y_HI)
LogisticRegression(solver='liblinear')
# let's see how this HI varies with RUL for a specific engine
# get data for one engine
engine_num=50
engine_sensors=sensors_pca[engine_slices[engine_num],:]
RUL_engine = df.loc[engine_num]['RUL']
# predict the HI
HI_pred_lin = HI_linear.predict(engine_sensors)
HI_pred_log = HI_logistic.predict_proba(engine_sensors)[:,1]
# plot fused HI signal for the linear and logistic models
fig=plt.figure(figsize=(25,10))
plt.plot(RUL_engine,HI_pred_lin,label='Linear')
plt.plot(RUL_engine,HI_pred_log,label='Logistic')
plt.title('Health Index (HI)')
plt.xlabel('RUL [cycles]')
plt.ylabel('HI')
plt.legend();
fig = plt.figure(figsize=(15,12))
for engine_num in engines:
    engine_sensors = sensors_pca[engine_slices[engine_num], :]
    RUL_engine = df.loc[engine_num]['RUL']
    # predict the HI
    HI_pred_lin = HI_linear.predict(engine_sensors)
    # HI_pred_log = HI_logistic.predict_proba(engine_sensors)[:,1]
    plt.scatter(RUL_engine, HI_pred_lin, label=engine_num, s=25)
    # plt.scatter(RUL_engine, HI_pred_log, label='Logistic')
plt.title('Health Index (HI)')
plt.xlabel('RUL [cycles]')
plt.ylabel('HI')
plt.tight_layout()
# plt.legend();
fig = plt.figure(figsize=(25,8))
for engine_num in engines:
    engine_sensors = sensors_pca[engine_slices[engine_num], :]
    cycles_engine = df.loc[engine_num]['time_cycles']  # x-axis is the cycle count here, not RUL
    # predict the HI
    HI_pred_lin = HI_linear.predict(engine_sensors)
    # HI_pred_log = HI_logistic.predict_proba(engine_sensors)[:,1]
    plt.scatter(cycles_engine, HI_pred_lin, label=engine_num, s=25)
    # plt.scatter(cycles_engine, HI_pred_log, label='Logistic')
plt.title('Health Index (HI)')
plt.xlabel('Number of cycles')
plt.ylabel('HI')
# plt.legend();
To predict the RUL we use a regression-based approach, with the standardized (and, below, PCA-transformed) sensor data as inputs and RUL as the target.
lm = LinearRegression().fit(standard_sensors, df['RUL'])  # quick baseline on the raw standardized sensors (not used below)
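Since the section motivates the fused health index, a minimal sketch of regressing RUL on the HI alone, reusing the HI_linear model fitted above (the R² print is illustrative, not a reported result):

HI_all = HI_linear.predict(sensors_pca).reshape(-1, 1)  # fused HI for every row
hi_rul_model = LinearRegression().fit(HI_all, df['RUL'])
print(hi_rul_model.score(HI_all, df['RUL']))  # R^2 of RUL ~ HI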
from sklearn.model_selection import train_test_split
X = pca.transform(standard_sensors)
y = df['RUL']
X.shape
(20631, 13)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
print(X.shape)
print(X_train.shape)
print(X_test.shape)
(20631, 13) (18567, 13) (2064, 13)
lm1 = LinearRegression().fit(X_train,y_train)
y_predict = lm1.predict(X_test)
from sklearn.metrics import mean_absolute_error,mean_squared_error
MAE = mean_absolute_error(y_test,y_predict)
MSE = mean_squared_error(y_test,y_predict)
print('MAE: \t{}'.format(MAE))
print('MSE: \t{}'.format(MSE))
print('RMSE:\t{}'.format(np.sqrt(MSE)))
print('MEAN:\t {}'.format(y_test.mean()))
print('% error: {}'.format(100*np.sqrt(MSE)/y_test.mean()))
MAE:  33.99844042924541
MSE:  1922.026047559994
RMSE: 43.84091750362889
MEAN: 108.65552325581395
% error: 40.34854022138543
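Note that a plain random 90/10 split can be optimistic here, because cycles from the same engine land in both train and test. A hedged sketch of a leakage-free alternative, splitting by engine with GroupShuffleSplit (the grouped variable names are illustrative):

from sklearn.model_selection import GroupShuffleSplit

# split by engine so no engine contributes rows to both train and test
gss = GroupShuffleSplit(n_splits=1, test_size=0.10, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=df.index.values))
X_train_g, X_test_g = X[train_idx], X[test_idx]
y_train_g, y_test_g = y.iloc[train_idx], y.iloc[test_idx]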
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100,max_depth=5, min_samples_split=2, min_samples_leaf=1).fit(X_train,y_train)
rfr_y_predict = rfr.predict(X_test)
rfr_MAE = mean_absolute_error(y_test,rfr_y_predict)
rfr_MSE = mean_squared_error(y_test,rfr_y_predict)
print('MAE: \t{}'.format(rfr_MAE))
print('MSE: \t{}'.format(rfr_MSE))
print('RMSE:\t{}'.format(np.sqrt(rfr_MSE)))
print('MEAN:\t {}'.format(y_test.mean()))
print('% error: {}'.format(100*np.sqrt(rfr_MSE)/y_test.mean()))
MAE:  30.0488317865194
MSE:  1687.8622036186655
RMSE: 41.08360017840045
MEAN: 108.65552325581395
% error: 37.81087141025954
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(max_depth=5, min_samples_split=2, min_samples_leaf=1).fit(X_train,y_train)
dtr_y_predict = dtr.predict(X_test)
dtr_MAE = mean_absolute_error(y_test,dtr_y_predict)
dtr_MSE = mean_squared_error(y_test,dtr_y_predict)
print('MAE: \t{}'.format(dtr_MAE))
print('MSE: \t{}'.format(dtr_MSE))
print('RMSE:\t{}'.format(np.sqrt(dtr_MSE)))
print('MEAN:\t {}'.format(y_test.mean()))
print('% error: {}'.format(100*np.sqrt(dtr_MSE)/y_test.mean()))
MAE:  30.508410819218746
MSE:  1737.9833842332162
RMSE: 41.68912789005326
MEAN: 108.65552325581395
% error: 38.368162649129346
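Summary of the three regression baselines on the held-out 10% split:

Model | MAE | RMSE | % error
---|---|---|---
Linear regression | 34.00 | 43.84 | 40.3
Random forest (100 trees, depth 5) | 30.05 | 41.08 | 37.8
Decision tree (depth 5) | 30.51 | 41.69 | 38.4

The shallow tree models edge out the linear baseline, but all three hover near 40% relative error, suggesting the PCA features alone carry limited RUL signal.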