Skip to the content.

Fraudulent Transactions

(Classification of Imbalanced Dataset)

Introduction

Outline of the project:

General overview of the data

# Importing the required libraries
import numpy as np
import pandas as pd  # needed for pd.read_csv / pd.get_dummies / pd.concat below
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# NOTE: this silences every warning for the rest of the session,
# including convergence warnings raised by the classifiers further down.
warnings.filterwarnings('ignore')

# A quick info: load the raw transactions and print the column summary
df = pd.read_csv("Fraud.csv")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB

</div>

# General view: display the first five rows of the data frame
df.head(5)
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 PAYMENT 9839.64 C1231006815 170136.0 160296.36 M1979787155 0.0 0.0 0 0
1 1 PAYMENT 1864.28 C1666544295 21249.0 19384.72 M2044282225 0.0 0.0 0 0
2 1 TRANSFER 181.00 C1305486145 181.0 0.00 C553264065 0.0 0.0 1 0
3 1 CASH_OUT 181.00 C840083671 181.0 0.00 C38997010 21182.0 0.0 1 0
4 1 PAYMENT 11668.14 C2048537720 41554.0 29885.86 M1230701703 0.0 0.0 0 0
# Scatter of sender's old balance against receiver's new balance,
# drawn once per class so each class gets its own colour and marker.
plt.style.use('dark_background')
fig = plt.gcf()
fig.set_size_inches(10, 6)
class_styles = (
  (1, 'red', 'Fraud', '*'),
  (0, 'green', 'non-Fraud', '+'),
)
for class_value, colour, legend_text, glyph in class_styles:
  subset = df[df['isFraud'] == class_value]
  sns.scatterplot(data=subset, x='oldbalanceOrg', y='newbalanceDest',
                  color=colour, label=legend_text, marker=glyph)
plt.legend()
plt.show()

</div>

# Fraud vs non-fraud transaction counts, printed and then shown as a bar chart
fraud_rows = int((df['isFraud'] == 1).sum())
legit_rows = int((df['isFraud'] == 0).sum())
print('Number of fraud Transactions = ', fraud_rows)
print('Number of non-fraud Transactions = ', legit_rows)
sns.countplot(data=df, x='isFraud');
Number of fraud Transactions =  8213
Number of non-fraud Transactions =  6354407

</div>

Visualization

</section>

# Getting a sample of 5000 rows and saving it for future reference
df_vis = df.sample(5000, random_state=0)

# index=False: without it the old row index is written to the file and comes
# back as a spurious 'Unnamed: 0' column when the CSV is read again.
df_vis.to_csv('df_vis.csv', index=False)
df_vis = pd.read_csv('df_vis.csv')

Do fraudulent transactions involve high amounts? If so, which transaction type is used?

# Swarm plot of transaction amount per transaction type on the 5000-row sample,
# coloured by the isFraud label
sns.swarmplot(x = 'type', y = 'amount', hue = 'isFraud', data = df_vis);

Does the balance in the account relate to fraudulent transactions? If so, for what amounts?

# Scatter of the sender's opening balance against the transaction amount on the
# sampled data, coloured by the isFraud label
sns.scatterplot(x = 'oldbalanceOrg', y = 'amount', hue = 'isFraud', data = df_vis);

Skewness in the data

# One boxen plot per numeric column of the sample, followed by the column name
numeric_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
for column_name in numeric_columns:
  sns.boxenplot(data=df_vis, x=column_name);
  plt.show()
  print(column_name, '\n')
 

amount 

oldbalanceOrg 

newbalanceOrig 

oldbalanceDest 

newbalanceDest 

Issues with default classifier for imbalanced data

Let us train a few classifiers on the data as it is, without any rebalancing, to see how they perform.

Default Classifiers

# Work on a copy so the raw data frame is left untouched
df_def = df.copy()

# Dropping features that are not required
df_def = df_def.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])

# Encoding categorical variables
df_def = pd.get_dummies(df_def)

# Log transformation of skewed features.
# log1p(x) computes log(1 + x) and stays accurate for values close to zero,
# where forming 1 + x first would lose precision.
sfeature = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
df_def[sfeature] = np.log1p(df_def[sfeature])
df_def.head()
step amount oldbalanceOrg newbalanceOrig oldbalanceDest newbalanceDest isFraud type_CASH_IN type_CASH_OUT type_DEBIT type_PAYMENT type_TRANSFER
0 1 9.194276 12.044359 11.984786 0.000000 0.0 0 0 0 0 1 0
1 1 7.531166 9.964112 9.872292 0.000000 0.0 0 0 0 0 1 0
2 1 5.204007 5.204007 0.000000 0.000000 0.0 1 0 0 0 0 1
3 1 5.204007 5.204007 0.000000 9.960954 0.0 1 0 1 0 0 0
4 1 9.364703 10.634773 10.305174 0.000000 0.0 0 0 0 0 1 0

Model Pipeline

# Features and target.
# Name the column axis explicitly: the previous `axis = True` only worked
# because True compares equal to 1.
features = df_def.drop(columns='isFraud')
target = df_def['isFraud']

# Splitting the train and test data (70 % train / 30 % test, fixed seed)
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(features, target, test_size=0.3, random_state=0)

# Importing the classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

def performace(xtrain, ytrain, xtest, ytest, classifier):
  """Print a classification report and plot the confusion matrix on the test set.

  `classifier` must already be fitted; it is only asked to predict on `xtest`.
  `xtrain` and `ytrain` are accepted but not used inside this function.
  """
  predictions = classifier.predict(xtest)
  summary = classification_report(ytest, predictions)
  matrix = confusion_matrix(ytest, predictions)
  print(summary)
  ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
  plt.show()

  # Result
# Fit each model on the training split, then report its test-set performance
candidate_models = [
  LogisticRegression(random_state=0),
  RandomForestClassifier(random_state=0),
  GaussianNB(),
]
for model in candidate_models:
  model.fit(xtrain, ytrain)
  print(model)
  performace(xtrain, ytrain, xtest, ytest, model)
LogisticRegression(random_state=0)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906367
           1       0.86      0.48      0.62      2419

    accuracy                           1.00   1908786
   macro avg       0.93      0.74      0.81   1908786
weighted avg       1.00      1.00      1.00   1908786

RandomForestClassifier(random_state=0)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906367
           1       0.98      0.79      0.88      2419

    accuracy                           1.00   1908786
   macro avg       0.99      0.90      0.94   1908786
weighted avg       1.00      1.00      1.00   1908786

GaussianNB()
              precision    recall  f1-score   support

           0       1.00      0.82      0.90   1906367
           1       0.01      0.97      0.01      2419

    accuracy                           0.82   1908786
   macro avg       0.50      0.89      0.46   1908786
weighted avg       1.00      0.82      0.90   1908786

Classification with resampling

Resampling

Resampling can be done in two ways:

Undersampling (Resampling)

# Selecting the class 1 (fraud) samples
df1 = df[df['isFraud'] == 1]

# Selecting an equal number of random class 0 samples.
# The draw is seeded so the same subset (and the same results) come out on every run.
df0 = df[df['isFraud'] == 0].sample(df1.shape[0], random_state=0)

# Merging to get the final data frame
df_und_sam = pd.concat([df1, df0])

# Saving it for future reference.
# index=False keeps the original row index out of the file; otherwise it is
# read back as an 'Unnamed: 0' column and ends up among the model features.
df_und_sam.to_csv('Under_sampled_data.csv', index=False)

# From here on `df` refers to the balanced, undersampled data
df = pd.read_csv('Under_sampled_data.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16426 entries, 0 to 16425
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      16426 non-null  int64  
 1   step            16426 non-null  int64  
 2   type            16426 non-null  object 
 3   amount          16426 non-null  float64
 4   nameOrig        16426 non-null  object 
 5   oldbalanceOrg   16426 non-null  float64
 6   newbalanceOrig  16426 non-null  float64
 7   nameDest        16426 non-null  object 
 8   oldbalanceDest  16426 non-null  float64
 9   newbalanceDest  16426 non-null  float64
 10  isFraud         16426 non-null  int64  
 11  isFlaggedFraud  16426 non-null  int64  
dtypes: float64(5), int64(4), object(3)
memory usage: 1.5+ MB
# General view of the data: the same balance scatter as before, one pass per class
plt.style.use('dark_background')
fig = plt.gcf()
fig.set_size_inches(5, 5)
class_styles = (
  (1, 'red', 'Fraud', '*'),
  (0, 'green', 'non-Fraud', '+'),
)
for class_value, colour, legend_text, glyph in class_styles:
  subset = df[df['isFraud'] == class_value]
  sns.scatterplot(data=subset, x='oldbalanceOrg', y='newbalanceDest',
                  color=colour, label=legend_text, marker=glyph)
# Restrict the y-axis range
plt.ylim(-0.01*(10**8), 0.5*(10**8))
plt.legend()
plt.show()

Visualization

# Amount vs transaction type on a 1000-row sample.
# The sample is seeded so the figure is the same on every run.
sns.swarmplot(x='type', y='amount', data=df.sample(1000, random_state=0), hue='isFraud');

# Skewness in the numerical features: one boxen plot per column, then its name
numeric_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
for column_name in numeric_columns:
  sns.boxenplot(data=df, x=column_name);
  plt.show()
  print(column_name, '\n')

amount 

oldbalanceOrg 

newbalanceOrig 

oldbalanceDest 

newbalanceDest 

Data processing

# Dropping features that are not required
df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])

# A CSV written together with its index is read back with an 'Unnamed: 0'
# column holding the old row numbers. It is not a real feature, so remove it
# when present (errors='ignore' makes this a no-op when it is absent).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Encoding categorical variables
df = pd.get_dummies(df)

# Log transformation of skewed features.
# log1p(x) computes log(1 + x) and stays accurate for values close to zero.
sfeature = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
df[sfeature] = np.log1p(df[sfeature])

Model Pipeline

# Features and target.
# Name the column axis explicitly: the previous `axis = True` only worked
# because True compares equal to 1.
features = df.drop(columns='isFraud')
target = df['isFraud']

# Splitting the train and test data (70 % train / 30 % test, fixed seed)
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(features, target, test_size=0.3, random_state=0)

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

def performace(xtrain, ytrain, xtest, ytest, classifier):
  """Print a classification report and plot the confusion matrix on the test set.

  `classifier` must already be fitted; it is only asked to predict on `xtest`.
  `xtrain` and `ytrain` are accepted but not used inside this function.
  """
  predictions = classifier.predict(xtest)
  summary = classification_report(ytest, predictions)
  matrix = confusion_matrix(ytest, predictions)
  print(summary)
  ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
  plt.show()

# Result: fit each model on the training split, then report its test-set performance
candidate_models = [
  LogisticRegression(random_state=0),
  RandomForestClassifier(random_state=0),
  GaussianNB(),
]
for model in candidate_models:
  model.fit(xtrain, ytrain)
  print(model)
  performace(xtrain, ytrain, xtest, ytest, model)
LogisticRegression(random_state=0)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2490
           1       0.49      1.00      0.66      2438

    accuracy                           0.49      4928
   macro avg       0.25      0.50      0.33      4928
weighted avg       0.24      0.49      0.33      4928


RandomForestClassifier(random_state=0)
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2490
           1       0.99      0.99      0.99      2438

    accuracy                           0.99      4928
   macro avg       0.99      0.99      0.99      4928
weighted avg       0.99      0.99      0.99      4928


GaussianNB()
              precision    recall  f1-score   support

           0       0.63      0.74      0.68      2490
           1       0.68      0.56      0.62      2438

    accuracy                           0.65      4928
   macro avg       0.66      0.65      0.65      4928
weighted avg       0.66      0.65      0.65      4928


Conclusion