fork download
  1. import pandas as pd
  2. from sklearn.metrics import make_scorer, accuracy_score
  3.  
  4. df = pd.read_csv('titanic.csv')
  5.  
  6. include = ['Age', 'Sex', 'Embarked', 'Survived']
  7. dependent_variable = include[-1]
  8. print dependent_variable
  9.  
  10. from sklearn.ensemble import RandomForestClassifier as rf
  11.  
  12. df_ = df[include]
  13.  
  14. categoricals = [] # going to one-hot encode categorical variables
  15.  
  16. for col, col_type in df_.dtypes.iteritems():
  17. if col_type == 'object':
  18. print col
  19. categoricals.append(col)
  20. else:
  21. pass
  22. df_[col].fillna(0, inplace=True) # fill NA's with 0 for ints/floats, too generic
  23. print categoricals
  24.  
  25. # get_dummies effectively creates one-hot encoded variables
  26. df_ohe = pd.get_dummies(df_, columns=categoricals, dummy_na=False)
  27. print df_ohe
  28.  
  29. # print pd.
  30. # get_dummies(data=df, columns=dependent_variable)
  31.  
  32. x = df_ohe[df_ohe.columns.difference([dependent_variable])]
  33.  
  34. y = df_ohe[dependent_variable]
  35. #
  36. # print (x)
  37. #
  38. clf = rf()
  39. clf.fit(x, y)
  40. #
  41. model_columns = list(x.columns)
  42. #
  43. # #
  44. json_ = [{"Age": 85, "Sex": "male", "Embarked": "S"}]
  45. query = pd.get_dummies(pd.DataFrame(json_))
  46. query = query.reindex(columns=model_columns, fill_value=0)
  47.  
  48. result = clf.predict(query)
  49. print result
  50.  
  51. from sklearn.metrics import accuracy_score
  52. print accuracy_score(training_data, result)
Runtime error #stdin #stdout #stderr 0s 23336KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Traceback (most recent call last):
  File "prog.py", line 1, in <module>
ImportError: No module named pandas