fork download
  1. import statsmodels.formula.api as smf
  2. import seaborn as sns
  3. import matplotlib.pyplot as plt
  4. from pandas import DataFrame
  5. import tkinter as tk
  6. import tkinter.filedialog as fd
  7. import csv
  8. import pandas as pd
  9. import collections
  10. import numpy as np
  11.  
  12.  
  13. def outlier_iqr(df, chk_column, output_column):
  14.  
  15. # 列を抽出する
  16. col = df.ix[:, chk_column]
  17. print(col)
  18. # 四分位数
  19. # q1 = col.describe()['25%']
  20. # q3 = col.describe()['75%']
  21. q75, q25 = np.percentile(col, [75, 25])
  22.  
  23. iqr = q75 - q25 # 四分位範囲
  24. print("25パーセント点", q25)
  25. print("75パーセント点", q75)
  26. print("四分位範囲", iqr)
  27.  
  28. # 外れ値の基準点
  29. outlier_min = q25 - (iqr) * 1.5
  30. outlier_max = q75 + (iqr) * 1.5
  31.  
  32. print('aa', outlier_min)
  33. print(outlier_max)
  34.  
  35. print(df[chk_column])
  36.  
  37. # 範囲から外れている値を除く
  38. for i, latency in enumerate(df[chk_column]):
  39. if latency > outlier_max or latency < outlier_min:
  40. df.iloc[i][output_column] = 1
  41. print('1')
  42. print(df.iloc[i][output_column])
  43. else:
  44. df.iloc[i][output_column] = 0
  45. print('0')
  46. print(df.iloc[i][output_column])
  47. return df
  48.  
  49.  
  50. def input_data():
  51. root = tk.Tk()
  52. root.withdraw()
  53. file = fd.askopenfilename(
  54. title="ファイルを選んでください",
  55. filetypes=[("TEXT", "csv"), ("TEXT", "py"), ("HTML", "html")]
  56. )
  57.  
  58. df = pd.read_csv(file)
  59.  
  60. return df
  61.  
  62.  
  63. df = input_data()
  64. df = outlier_iqr(df, 'latency_msec', 'outlier_index')
  65. print(df)
Runtime error #stdin #stdout #stderr 0.01s 7208KB
stdin
x	latency_msec	outlier_index
1	1700	0
3	2500	0
545	233	0
2	565	0
4	2.8	0
1	1458	0
2	215	0
3	25488	0
4	245	0
stdout
Standard output is empty
stderr
Traceback (most recent call last):
  File "prog.py", line 1, in <module>
ImportError: No module named statsmodels.formula.api