import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv("bs140513_032310.csv")

data.head()


pd.plotting.scatter_matrix(data, figsize = (10,8))
plt.savefig("Scatter_matrix.png")


print(data.nunique())

step             180
customer        4112
age                8
gender             4
zipcodeOri         1
merchant          50
zipMerchant        1
category          15
amount         23767
fraud              2
dtype: int64


data = data.dropna()
data = data.drop(["step", "zipcodeOri", "zipMerchant"],  axis = 1)

data['customer'] = data['customer'].str.replace("\'", "")
data['age'] = data['age'].str.replace("\'", "")
data['gender'] = data['gender'].str.replace("\'", "")
data['merchant'] = data['merchant'].str.replace("\'", "")
data['category'] = data['category'].str.replace("\'", "")

data.head()


from sklearn.preprocessing import LabelEncoder, StandardScaler
le = LabelEncoder()

data["customer"] = le.fit_transform(data["customer"])
data["age"] = le.fit_transform(data["age"])
data["gender"] = le.fit_transform(data["gender"])
data["merchant"] = le.fit_transform(data["merchant"])
data["category"] = le.fit_transform(data["category"])
data.head()


data_x = data[data.columns.difference(["fraud"])]
data_y = data['fraud']

data_x = pd.get_dummies(data_x, columns=['age', 'gender', 'category'])

sc = StandardScaler()
data_x = sc.fit_transform(data_x)
data_x = pd.DataFrame(data_x)
data_x.head()


print(data_x.nunique())

0     23767
1      4112
2        50
3         2
4         2
5         2
6         2
7         2
8         2
9         2
10        2
11        2
12        2
13        2
14        2
15        2
16        2
17        2
18        2
19        2
20        2
21        2
22        2
23        2
24        2
25        2
26        2
27        2
28        2
29        2
dtype: int64


data_x.head()


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size = 0.6)


from sklearn.metrics import classification_report


from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)
y_pred_svc = svc.predict(x_test)

cm_svc = pd.crosstab(y_pred_svc, y_test, rownames = ['pred'], colnames = ['true'], margins = True)
report_svc = classification_report(y_test, y_pred_svc, labels = [0, 1])
print(cm_svc, '\n\n', report_svc)

true       0     1     All
pred
0     234692   993  235685
1        207  1966    2173
All   234899  2959  237858

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    234899
           1       0.90      0.66      0.77      2959

    accuracy                           0.99    237858
   macro avg       0.95      0.83      0.88    237858
weighted avg       0.99      0.99      0.99    237858


from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)

cm_lr = pd.crosstab(y_pred_lr, y_test, rownames = ['pred'], colnames = ['true'], margins = True)
report_lr = classification_report(y_test, y_pred_lr, labels = [0, 1])
print(cm_lr, '\n\n', report_lr)

true       0     1     All
pred
0     234724  1121  235845
1        175  1838    2013
All   234899  2959  237858

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    234899
           1       0.91      0.62      0.74      2959

    accuracy                           0.99    237858
   macro avg       0.95      0.81      0.87    237858
weighted avg       0.99      0.99      0.99    237858


from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
y_pred_rfc = rfc.predict(x_test)

cm_rfc = pd.crosstab(y_pred_rfc, y_test, rownames = ['pred'], colnames = ['true'], margins = True)
report_rfc = classification_report(y_test, y_pred_rfc, labels = [0, 1])
print(cm_rfc, '\n\n', report_rfc)

true       0     1     All
pred
0     234588   764  235352
1        311  2195    2506
All   234899  2959  237858

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    234899
           1       0.88      0.74      0.80      2959

    accuracy                           1.00    237858
   macro avg       0.94      0.87      0.90    237858
weighted avg       1.00      1.00      1.00    237858


from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)
y_pred_dtc = dtc.predict(x_test)

cm_dtc = pd.crosstab(y_pred_dtc, y_test, rownames = ['pred'], colnames = ['true'], margins = True)
report_dtc = classification_report(y_test, y_pred_dtc, labels = [0, 1])
print(cm_dtc, '\n\n', report_dtc)

true       0     1     All
pred
0     234129   715  234844
1        770  2244    3014
All   234899  2959  237858

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    234899
           1       0.74      0.76      0.75      2959

    accuracy                           0.99    237858
   macro avg       0.87      0.88      0.87    237858
weighted avg       0.99      0.99      0.99    237858


from sklearn.linear_model import Ridge
rc = Ridge()
rc.fit(x_train, y_train)
y_pred_rc = dtc.predict(x_test)

cm_rc = pd.crosstab(y_pred_rc, y_test, rownames = ['pred'], colnames = ['true'], margins = True)
report_rc = classification_report(y_test, y_pred_rc, labels = [0, 1])
print(cm_rc, '\n\n', report_rc)

true       0     1     All
pred
0     234129   715  234844
1        770  2244    3014
All   234899  2959  237858

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    234899
           1       0.74      0.76      0.75      2959

    accuracy                           0.99    237858
   macro avg       0.87      0.88      0.87    237858
weighted avg       0.99      0.99      0.99    237858


from sklearn.ensemble import AdaBoostClassifier
adab = AdaBoostClassifier()
adab.fit(x_train, y_train)
y_pred_adab = adab.predict(x_test)

cm_adab = pd.crosstab(y_pred_adab, y_test, rownames = ['pred'], colnames = ['true'], margins = True)
report_adab = classification_report(y_test, y_pred_adab, labels = [0, 1])
print(cm_adab, '\n\n', report_adab)

true       0     1     All
pred
0     234623   952  235575
1        276  2007    2283
All   234899  2959  237858

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    234899
           1       0.88      0.68      0.77      2959

    accuracy                           0.99    237858
   macro avg       0.94      0.84      0.88    237858
weighted avg       0.99      0.99      0.99    237858


from sklearn.model_selection import KFold, cross_val_predict
cross_val = KFold(n_splits = 10, random_state= None, shuffle = False)


lr_cv = LogisticRegression()
y_pred_lr_cv = cross_val_predict(lr_cv, x_train, y_train, cv = cross_val)
cm_lr_cv = pd.crosstab(y_pred_lr_cv, y_train, rownames = ['pred'], colnames = ['true'], margins = True)
report_lr_cv = classification_report(y_train, y_pred_lr_cv, labels = [0, 1])
print(cm_lr_cv, '\n\n', report_lr_cv)

true       0     1     All
pred
0     352265  1569  353834
1        279  2672    2951
All   352544  4241  356785

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    352544
           1       0.91      0.63      0.74      4241

    accuracy                           0.99    356785
   macro avg       0.95      0.81      0.87    356785
weighted avg       0.99      0.99      0.99    356785


rfc_cv = RandomForestClassifier()
y_pred_rfc_cv = cross_val_predict(rfc_cv, x_train, y_train, cv = cross_val)
cm_rfc_cv = pd.crosstab(y_pred_rfc_cv, y_train, rownames = ['pred'], colnames = ['true'], margins = True)
report_rfc_cv = classification_report(y_train, y_pred_rfc_cv, labels = [0, 1])
print(cm_rfc_cv, '\n\n', report_rfc_cv)

true       0     1     All
pred
0     352078  1045  353123
1        466  3196    3662
All   352544  4241  356785

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    352544
           1       0.87      0.75      0.81      4241

    accuracy                           1.00    356785
   macro avg       0.93      0.88      0.90    356785
weighted avg       1.00      1.00      1.00    356785


dtc_cv = DecisionTreeClassifier()
y_pred_dtc_cv = cross_val_predict(dtc_cv, x_train, y_train, cv = cross_val)
cm_dtc_cv = pd.crosstab(y_pred_dtc_cv, y_train, rownames = ['pred'], colnames = ['true'], margins = True)
report_dtc_cv = classification_report(y_train, y_pred_dtc_cv, labels = [0, 1])
print(cm_dtc_cv, '\n\n', report_dtc_cv)

true       0     1     All
pred
0     351462  1040  352502
1       1082  3201    4283
All   352544  4241  356785

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    352544
           1       0.75      0.75      0.75      4241

    accuracy                           0.99    356785
   macro avg       0.87      0.88      0.87    356785
weighted avg       0.99      0.99      0.99    356785


rc_cv = Ridge()


adab_cv = AdaBoostClassifier()
y_pred_adab_cv = cross_val_predict(adab_cv, x_train, y_train, cv = cross_val)
cm_adab_cv = pd.crosstab(y_pred_adab_cv, y_train, rownames = ['pred'], colnames = ['true'], margins = True)
report_adab_cv = classification_report(y_train, y_pred_adab_cv, labels = [0, 1])
print(cm_adab_cv, '\n\n', report_adab_cv)

true       0     1     All
pred
0     352118  1335  353453
1        426  2906    3332
All   352544  4241  356785

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    352544
           1       0.87      0.69      0.77      4241

    accuracy                           1.00    356785
   macro avg       0.93      0.84      0.88    356785
weighted avg       0.99      1.00      0.99    356785


print(data.head())

cols = ["customer", "age", "gender", "merchant", "category", "amount"]
data_x_pca = data[data.columns.difference(["fraud"])]
data_x_pca = data_x_pca[cols]
print(data_x_pca.head())


import seaborn as sns
from sklearn.decomposition import PCA, FactorAnalysis

pca = PCA()
data_x_pca = pca.fit(data_x_pca)
pca_components = pca.components_

print("explained variance ratio:\n\n", pca.explained_variance_ratio_)

var_ratio = pd.DataFrame({'var':pca.explained_variance_ratio_,
             'PC':['pc1','pc2','pc3','pc4','pc5','pc6']})

sns.barplot(x = 'PC', y = 'var', data=var_ratio, color="c");

   customer  age  gender  merchant  category  amount  fraud
0       210    4       2        30        12    4.55      0
1      2753    2       2        30        12   39.68      0
2      2285    4       1        18        12   26.89      0
3      1650    3       2        30        12   17.25      0
4      3585    5       2        30        12   35.72      0
   customer  age  gender  merchant  category  amount
0       210    4       2        30        12    4.55
1      2753    2       2        30        12   39.68
2      2285    4       1        18        12   26.89
3      1650    3       2        30        12   17.25
4      3585    5       2        30        12   35.72
explained variance ratio:

 [9.91188343e-01 8.75286931e-03 5.21648564e-05 5.18593705e-06
 1.25847043e-06 1.78590666e-07]


data.head()
data_x_over = data_x
data_y_over = data_y

from imblearn.over_sampling import SMOTE

over = SMOTE()
data_x_over, data_y_over = over.fit_resample(data_x_over, data_y_over)
data_y_over = pd.DataFrame(data_y_over)
print(data_y_over.iloc[0].value_counts())

0    1
Name: 0, dtype: int64


x_over_train, x_over_test, y_over_train, y_over_test = train_test_split(data_x_over, data_y_over, train_size =0.7 )


dtc.fit(x_over_train, y_over_train)
y_pred_over = dtc.predict(x_over_test)
y_pred_over = np.array(y_pred_over)
y_over_test = np.array(y_over_test)
y_over_test = y_over_test.reshape(-1,)
print(y_pred_over.shape, y_over_test.shape)

(352466,) (352466,)


cm_dtc_over = pd.crosstab(y_pred_over, y_over_test, rownames = ['pred'], colnames = ['true'], margins = True)
report_dtc_over = classification_report(y_over_test, y_pred_over, labels = [0, 1])
print( cm_dtc_over, '\n\n', report_dtc_over)

true       0       1     All
pred
0     174267    1560  175827
1       2109  174530  176639
All   176376  176090  352466

               precision    recall  f1-score   support

           0       0.99      0.99      0.99    176376
           1       0.99      0.99      0.99    176090

    accuracy                           0.99    352466
   macro avg       0.99      0.99      0.99    352466
weighted avg       0.99      0.99      0.99    352466

	customer	age	gender	zipcodeOri	merchant	zipMerchant	category	amount
0	'C1093826151'	'4'	'M'	'28007'	'M348934600'	'28007'	'es_transportation'	4.55
1	'C352968107'	'2'	'M'	'28007'	'M348934600'	'28007'	'es_transportation'	39.68
2	'C2054744914'	'4'	'F'	'28007'	'M1823072687'	'28007'	'es_transportation'	26.89
3	'C1760612790'	'3'	'M'	'28007'	'M348934600'	'28007'	'es_transportation'	17.25
4	'C757503768'	'5'	'M'	'28007'	'M348934600'	'28007'	'es_transportation'	35.72

	customer	age	gender	merchant	category	amount
0	C1093826151	4	M	M348934600	es_transportation	4.55
1	C352968107	2	M	M348934600	es_transportation	39.68
2	C2054744914	4	F	M1823072687	es_transportation	26.89
3	C1760612790	3	M	M348934600	es_transportation	17.25
4	C757503768	5	M	M348934600	es_transportation	35.72

	0	1	2	3	4	5	6	7	8	9	...	20	21	22	23	24	25	26	27	28	29
0	-0.299276	-1.545620	0.714001	-0.064347	-0.329165	-0.678119	-0.573390	2.110495	-0.343144	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339
1	0.016067	0.599484	0.714001	-0.064347	-0.329165	1.474668	-0.573390	-0.473822	-0.343144	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339
2	-0.098742	0.204710	-0.682938	-0.064347	-0.329165	-0.678119	-0.573390	2.110495	-0.343144	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339
3	-0.185275	-0.330933	0.714001	-0.064347	-0.329165	-0.678119	1.744015	-0.473822	-0.343144	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339
4	-0.019480	1.301303	0.714001	-0.064347	-0.329165	-0.678119	-0.573390	-0.473822	2.914227	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339

	0	1	2	3	4	5	6	7	8	9	...	20	21	22	23	24	25	26	27	28	29
0	-0.299276	-1.545620	0.714001	-0.064347	-0.329165	-0.678119	-0.573390	2.110495	-0.343144	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339
1	0.016067	0.599484	0.714001	-0.064347	-0.329165	1.474668	-0.573390	-0.473822	-0.343144	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339
2	-0.098742	0.204710	-0.682938	-0.064347	-0.329165	-0.678119	-0.573390	2.110495	-0.343144	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339
3	-0.185275	-0.330933	0.714001	-0.064347	-0.329165	-0.678119	1.744015	-0.473822	-0.343144	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339
4	-0.019480	1.301303	0.714001	-0.064347	-0.329165	-0.678119	-0.573390	-0.473822	2.914227	-0.217136	...	-0.057888	-0.054235	-0.10179	-0.02898	-0.039192	-0.082315	-0.063258	0.420991	-0.035011	-0.161339

Algorithm	Accuracy	True Positive (2840)	True Negative(235018)
SVC	1.00	1858 (65.42%)	234882 (99.94)
LR	0.99	1786 (62.88%)	234828 (99.91)
RFC	1.00	2193 (76.46%)	234729 (99.87)
DTC	0.99	2141 (75.38%)	234264 (99.67)
Ridge	0.99	2141 (75.38%)	234264 (99.67)
ADABoost	0.99	1925 (67.78%)	234728 (99.87)

Bank Fraud¶

1. Import the dataset and explore the data¶

Addressing the imbalance in data¶

Inference¶

2. Datacleaning and preprocessing¶

i.) Applying Classification Algorithms¶

a.) SVC¶

b.) Logistic Regression¶

c.) Random Forest Classifier¶

d.) Decision Tree Classifier¶

e.) Ridge Classifier¶

f.) ADAoost¶

ii.) Cross-Validation¶

a.) Logistic Regression¶

b.) RFC¶

c.) DTC¶

d.) Ridge Classifier¶

e.) ADABoost¶

Analysis¶

Addressing the data Imbalance¶

Conclusion¶

	customer	age	gender	merchant	category	amount
0	210	4	2	30	12	4.55
1	2753	2	2	30	12	39.68
2	2285	4	1	18	12	26.89
3	1650	3	2	30	12	17.25
4	3585	5	2	30	12	35.72

Algorithm	Accuracy	True Positive (4360)	True Negative(352425)
LR	0.99	2779 (63.73%)	352121 (99.91%)
RFC	1.00	3288 (75.41%)	351956 (99.86%)
SVC	0.99	2902 (66.55%)	352078 (99.90%)
DTC	0.99	3333 (76.44%)	351271 (99.67%)
Ridge	0.99	1544 (35.41%)	352357 (99.98%)
ADABoost	0.99	2992 (68.62%)	351968 (99.87%)