fork download
  1. # your code goes here
  2. !wget http://w...content-available-to-author-only...c.jp/~michi/toku2/book_category.tgz
  3. !tar zxf book_category.tgz
  4.  
  5.  
  6. !ls ./book_category
  7. !ls ./book_category | head
  8. !pip install mecab-python3
  9.  
  10. from sklearn.preprocessing import Normalizer
  11.  
  12.  
  13. topics = [
  14. 'computer_graphics',
  15. 'operating_systems',
  16. 'computer_security',
  17. 'application_service',
  18. 'computer_software',
  19. 'artificial_intelligence',
  20. 'search_engine',
  21. 'information_society',
  22. ]
  23.  
  24.  
import glob
import re

import MeCab
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder
  36. docs = []
  37. for topic in topics:
  38. for f in glob.glob(f"./book_category/{topic}/*.txt"):
  39.  
  40. with open(f, "r") as fin:
  41. body = "\n".join([line.strip() for line in fin if line.strip()])
  42. docs.append((topic,body))
  43.  
  44.  
  45. df = pd.DataFrame(
  46. docs,
  47. columns=["topic","body"],
  48. dtype="category"
  49. )
  50.  
  51.  
  52.  
  53. print(df.head())
  54.  
  55.  
  56. df.topic.value_counts()
  57.  
  58.  
  59. tagger = MeCab.Tagger("-Owakati")
  60.  
  61. def parse_to_wakati(text):
  62. return tagger.parse(text).strip()
  63.  
  64. df = df.assign(body_wakati=df.body.apply(parse_to_wakati))
  65.  
  66. print(df.head())
  67.  
  68. print(df.body_wakati.head())
  69.  
  70.  
  71. le = LabelEncoder()
  72. y = le.fit_transform(df.topic)
  73.  
  74.  
  75. print(le.classes_)
  76. print(le.transform(["computer_graphics"]))
  77. print(le.transform(["operating_systems"]))
  78.  
  79.  
  80.  
  81. X_train, X_test, y_train, y_test = train_test_split(
  82. df.body_wakati,
  83. y,
  84. test_size=0.2,
  85. random_state=10,
  86. shuffle=True
  87. )
  88.  
  89.  
  90. count_vect = CountVectorizer()
  91. X_train_counts = count_vect.fit_transform(df.body_wakati)
  92. tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
  93. X_train_tf = tf_transformer.transform(X_train_counts)
  94. print(X_train_tf)
  95. print(X_train_tf.shape)
  96.  
  97.  
  98. from sklearn.base import BaseEstimator, TransformerMixin
  99. from sklearn.metrics import classification_report, confusion_matrix
  100. class RulebasedEstimator(BaseEstimator, TransformerMixin):
  101. def __init__(self, label_encoder):
  102. self.le = label_encoder
  103.  
  104. def fit(self, X, y):
  105. return self
  106.  
  107. def predict(self, X):
  108. """ルールを記述"""
  109. result = []
  110. for text in X:
  111. pred = 0
  112. if re.search(r"(コンピュータ|グラフィックス|computer|graphics)", text):
  113. pred = self.le.transform(["computer_graphics"])[0]
  114. elif re.search(r"(オペレーティング|システム|operating|system)", text):
  115. pred = self.le.transform(["operating_systems"])[0]
  116. elif re.search(r"(暗号|セキュリティ|computer|security)", text):
  117. pred = self.le.transform(["computer_security"])[0]
  118. elif re.search(r"(アプリケーション|サービス)", text):
  119. pred = self.le.transform(["application_service"])[0]
  120. elif re.search(r"(コンピュータ|ソフトウェア|computer|software)", text):
  121. pred = self.le.transform(["computer_software"])[0]
  122. elif re.search(r"(人工|知能)", text):
  123. pred = self.le.transform(["artificial_intelligence"])[0]
  124. elif re.search(r"(検索|エンジン)", text):
  125. pred = self.le.transform(["search_engine"])[0]
  126. elif re.search(r"(情報|社会)", text):
  127. pred = self.le.transform(["information_society"])[0]
  128. result.append(pred)
  129. return result
  130.  
  131.  
  132. rulebased = RulebasedEstimator(label_encoder=le)
  133. rulebased_pred = rulebased.predict(X_test)
  134.  
  135.  
  136. from sklearn.metrics import confusion_matrix
  137. confusion_matrix(y_test,rulebased_pred)
  138.  
  139. print(classification_report(y_test, rulebased_pred, target_names=le.classes_))
  140.  
  141. from sklearn.decomposition import PCA
  142. from sklearn.preprocessing import StandardScaler
  143.  
  144. from sklearn.linear_model import SGDClassifier
  145. #pca = make_pipeline(StandardScaler(),PCA(n_components=2, random_state=0))
  146.  
  147. from sklearn.decomposition import TruncatedSVD
  148.  
  149. svd = TruncatedSVD(n_components=1693 , n_iter=7, random_state=42)
  150.  
  151.  
  152. text_clf = Pipeline([
  153. ('tfidf', TfidfVectorizer()),
  154. ('std',StandardScaler()),
  155. ('PCA', TruncatedSVD(n_components=1693 , n_iter=7, random_state=42)),
  156. ('clf', SGDClassifier(loss='hinge', penalty='l2',
  157. alpha=1e-3, random_state=90,
  158. max_iter=100, tol=None)),
  159. ])
  160.  
  161. text_clf.fit(X_train,y_train)
  162.  
  163. predicted = text_clf.predict(X_test)
  164.  
  165.  
  166. confusion_matrix(y_test,predicted)
  167.  
  168. print(classification_report(y_test, predicted, target_names=le.classes_))
Compilation error #stdin compilation error #stdout 0s 0KB
stdin
Standard input is empty
compilation info
Traceback (most recent call last):
  File "/usr/lib/python3.7/py_compile.py", line 143, in compile
    _optimize=optimize)
  File "<frozen importlib._bootstrap_external>", line 791, in source_to_code
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "./prog.py", line 2
    !wget http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz
    ^
SyntaxError: invalid syntax

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/usr/lib/python3.7/py_compile.py", line 147, in compile
    raise py_exc
py_compile.PyCompileError:   File "./prog.py", line 2
    !wget http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz
    ^
SyntaxError: invalid syntax

stdout
Standard output is empty