# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import pandas as pd # used for data manipulation and analysis
import numpy as np # used for multi-dimensional arrays and matrices

import seaborn as sns # high-level interface for drawing attractive and informative statistical graphics
import matplotlib.pyplot as plt # object-oriented API for embedding plots into applications
# (in a notebook, %matplotlib inline would set the plotting backend here)
import plotly.express as px
import time # used to measure elapsed time

from sklearn.linear_model import LogisticRegression # algorithm used to classify URLs as good or bad
from sklearn.naive_bayes import MultinomialNB # NLP-friendly algorithm used to classify URLs as good or bad

from sklearn.model_selection import train_test_split # splits the data into train and test sets
from sklearn.metrics import classification_report # full report of metrics (e.g. recall, precision, f1-score)
from sklearn.metrics import confusion_matrix # compares actual vs. predicted labels
from nltk.tokenize import RegexpTokenizer # regexp tokenizer used to split words out of text
from nltk.stem.snowball import SnowballStemmer # stems words
from sklearn.feature_extraction.text import CountVectorizer # builds a sparse matrix of word counts
from sklearn.pipeline import make_pipeline # chains the preprocessing steps and the estimator

from PIL import Image # load mask images for the word clouds
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator # creates word clouds

from bs4 import BeautifulSoup # used for scraping data from websites (not used below)
from selenium import webdriver # used to automate Chrome (not used below)
import networkx as nx # creation, manipulation, and study of complex networks (not used below)

import pickle # used to serialize the trained model

import warnings # suppress warning messages
warnings.filterwarnings('ignore')
# Loading the dataset
phish_data = pd.read_csv('/kaggle/input/phishing-site-urls/phishing_site_urls.csv')
phish_data.head()
phish_data.tail()
phish_data.info()
phish_data.isnull().sum() # there are no missing values
# create a dataframe of class counts
label_counts = pd.DataFrame(phish_data.Label.value_counts())
# visualize the target column
fig = px.bar(label_counts, x=label_counts.index, y=label_counts.Label)
fig.show()
tokenizer = RegexpTokenizer(r'[A-Za-z]+') # keep alphabetic character runs only
phish_data.URL[0]
# this pulls out the letter sequences that match the expression
tokenizer.tokenize(phish_data.URL[0]) # using the first row
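# For instance (hypothetical URL, not from the dataset):
# tokenizer.tokenize('example.com/path_123') -> ['example', 'com', 'path']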
print('Getting words tokenized ...')
t0= time.perf_counter()
phish_data['text_tokenized'] = phish_data.URL.map(lambda t: tokenizer.tokenize(t)) # doing with all rows
t1 = time.perf_counter() - t0
print('Time taken',t1 ,'sec')
phish_data.sample(5)
stemmer = SnowballStemmer("english") # choose a language
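# For instance, stemmer.stem('running') -> 'run' and stemmer.stem('example') -> 'exampl'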
print('Getting words stemmed ...')
t0= time.perf_counter()
phish_data['text_stemmed'] = phish_data['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])
t1= time.perf_counter() - t0
print('Time taken',t1 ,'sec')
phish_data.sample(5)
print('Joining words ...')
t0= time.perf_counter()
phish_data['text_sent'] = phish_data['text_stemmed'].map(lambda l: ' '.join(l))
t1= time.perf_counter() - t0
print('Time taken',t1 ,'sec')
phish_data.sample(5)
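# End to end (hypothetical URL): 'www.example.com/login' is tokenized to
# ['www', 'example', 'com', 'login'], stemmed, and rejoined as 'www exampl com login'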
# slicing the classes
bad_sites = phish_data[phish_data.Label == 'bad']
good_sites = phish_data[phish_data.Label == 'good']
bad_sites.head()
good_sites.head()
def plot_wordcloud(text, mask=None, max_words=400, max_font_size=120, figure_size=(24.0,16.0), 
                   title = None, title_size=40, image_color=False):
    stopwords = set(STOPWORDS)
    more_stopwords = {'com','http'}
    stopwords = stopwords.union(more_stopwords)

    wordcloud = WordCloud(background_color='white',
                    stopwords = stopwords,
                    max_words = max_words,
                    max_font_size = max_font_size, 
                    random_state = 42,
                    mask = mask)
    wordcloud.generate(text)
    
    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
        plt.title(title, fontdict={'size': title_size,
                                  'verticalalignment': 'bottom'})
    else:
        plt.imshow(wordcloud)
        plt.title(title, fontdict={'size': title_size, 'color': 'green',
                                  'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()  
d = '../input/masks/masks-wordclouds/'
data = good_sites.text_sent
data.reset_index(drop=True, inplace=True)
common_text = ' '.join(data) # join all rows into one string (str(data) would only give the truncated repr)
common_mask = np.array(Image.open(d+'star.png'))
plot_wordcloud(common_text, common_mask, max_words=400, max_font_size=120,
               title = 'Most common words used in good URLs', title_size=15)
data = bad_sites.text_sent
data.reset_index(drop=True, inplace=True)
common_text = ' '.join(data)
common_mask = np.array(Image.open(d+'comment.png'))
plot_wordcloud(common_text, common_mask, max_words=400, max_font_size=120,
               title = 'Most common words used in bad URLs', title_size=15)
# create the CountVectorizer object
cv = CountVectorizer()
help(CountVectorizer())
feature = cv.fit_transform(phish_data.text_sent) # vectorize the tokenized and stemmed text
feature[:5].toarray() # convert sparse matrix into array to print transformed features
trainX, testX, trainY, testY = train_test_split(feature, phish_data.Label)
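# Note: train_test_split defaults to a 75/25 train/test split; passing
# random_state (e.g. random_state=42) makes the split reproducible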
# create lr object
lr = LogisticRegression()
lr.fit(trainX,trainY)
lr.score(testX,testY)
Scores_ml = {}
Scores_ml['Logistic Regression'] = np.round(lr.score(testX,testY),2)
print('Training Accuracy :',lr.score(trainX,trainY))
print('Testing Accuracy :',lr.score(testX,testY))
con_mat = pd.DataFrame(confusion_matrix(testY, lr.predict(testX)),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, lr.predict(testX),
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")
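# Optional refactor (a sketch, not in the original notebook): the evaluation block
# above is repeated for each model, so a small helper avoids the duplication.
def evaluate_model(model, testX, testY):
    preds = model.predict(testX)
    con_mat = pd.DataFrame(confusion_matrix(testY, preds),
                           columns=['Predicted:Bad', 'Predicted:Good'],
                           index=['Actual:Bad', 'Actual:Good'])
    print('\nCLASSIFICATION REPORT\n')
    print(classification_report(testY, preds, target_names=['Bad', 'Good']))
    print('\nCONFUSION MATRIX')
    plt.figure(figsize=(6, 4))
    sns.heatmap(con_mat, annot=True, fmt='d', cmap='YlGnBu')
# e.g. evaluate_model(lr, testX, testY)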
# create mnb object
mnb = MultinomialNB()
mnb.fit(trainX,trainY)
Scores_ml['MultinomialNB'] = np.round(mnb.score(testX,testY),2)
print('Training Accuracy :',mnb.score(trainX,trainY))
print('Testing Accuracy :',mnb.score(testX,testY))
con_mat = pd.DataFrame(confusion_matrix(testY, mnb.predict(testX)),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, mnb.predict(testX),
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")
acc = pd.DataFrame.from_dict(Scores_ml,orient = 'index',columns=['Accuracy'])
sns.set_style('darkgrid')
sns.barplot(x=acc.index, y=acc.Accuracy)
pipeline_ls = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), LogisticRegression())
## (r'\b(?:http|ftp)s?://\S*\w|\w+|[^\w\s]+') and ([a-zA-Z]+)([0-9]+) -- these tokenizers gave lower accuracy
trainX, testX, trainY, testY = train_test_split(phish_data.URL, phish_data.Label)
pipeline_ls.fit(trainX,trainY)
pipeline_ls.score(testX,testY) 
print('Training Accuracy :',pipeline_ls.score(trainX,trainY))
print('Testing Accuracy :',pipeline_ls.score(testX,testY))
con_mat = pd.DataFrame(confusion_matrix(testY, pipeline_ls.predict(testX)),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
print(classification_report(testY, pipeline_ls.predict(testX),
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")
with open('phishing.pkl', 'wb') as f:
    pickle.dump(pipeline_ls, f)
with open('phishing.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
result = loaded_model.score(testX,testY)
print(result)
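# Note: joblib is a common alternative to pickle for persisting sklearn models
# (assumption: joblib is installed in the environment):
# import joblib
# joblib.dump(pipeline_ls, 'phishing.joblib')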
predict_bad = ['yeniik.com.tr/wp-admin/js/login.alibaba.com/login.jsp.php','fazan-pacir.rs/temp/libraries/ipad','tubemoviez.exe','svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt']
predict_good = ['youtube.com/','youtube.com/watch?v=qI0TQJI3vdU','retailhellunderground.com/','restorevisioncenters.com/html/technology.html']
with open('phishing.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
# no manual vectorization needed: the pipeline's CountVectorizer handles raw URLs
result = loaded_model.predict(predict_bad)
result2 = loaded_model.predict(predict_good)
print(result)
print("*"*30)
print(result2)
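# A minimal convenience wrapper (hypothetical helper, not in the original notebook):
def classify_urls(urls, model_path='phishing.pkl'):
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    return model.predict(urls)

# e.g. print(classify_urls(predict_good))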