# your code goes here
# Environment setup: download the book-category corpus, unpack it, list what
# was extracted, and install the MeCab binding.  These steps were written as
# IPython "!" shell escapes, which are a SyntaxError when the file is run as a
# plain Python script, so they are expressed with the standard library here.
import os
import subprocess
import sys
import tarfile
import urllib.request

CORPUS_URL = "http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz"
CORPUS_ARCHIVE = "book_category.tgz"
CORPUS_DIR = "./book_category"

# Equivalent of `wget`: fetch the archive into the working directory.
urllib.request.urlretrieve(CORPUS_URL, CORPUS_ARCHIVE)

# Equivalent of `tar zxf`: unpack the gzip-compressed archive in place.
# NOTE(review): the archive comes from the network and extractall() trusts its
# member paths, exactly as `tar zxf` did -- only use with a trusted source.
with tarfile.open(CORPUS_ARCHIVE, "r:gz") as archive:
    archive.extractall()

# Equivalent of `ls ./book_category` followed by `ls ./book_category | head`
# (hidden entries skipped, first ten entries for the second listing).
corpus_entries = sorted(
    entry for entry in os.listdir(CORPUS_DIR) if not entry.startswith(".")
)
print("\n".join(corpus_entries))
print("\n".join(corpus_entries[:10]))

# Equivalent of `pip install mecab-python3`, bound to the running interpreter
# so the package lands in the environment that will import MeCab.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "mecab-python3"],
    check=True,
)
from sklearn.preprocessing import Normalizer
# Topic names; each one is also the name of a sub-directory of
# ./book_category that is globbed for *.txt documents.
topics = [
    "computer_graphics",
    "operating_systems",
    "computer_security",
    "application_service",
    "computer_software",
    "artificial_intelligence",
    "search_engine",
    "information_society",
]
import glob
import re
import MeCab
import pandas as pd
from sklearn.feature_extraction .text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction .text import TfidfTransformer
from sklearn.pipeline import make_pipeline
# Read every corpus file into (topic, body) pairs, one pair per document.
docs = []
for topic in topics:
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # keeps the document order -- and the seeded train/test split derived
    # from it -- reproducible from one machine to the next.
    for path in sorted(glob.glob(f"./book_category/{topic}/*.txt")):
        # NOTE(review): no encoding is passed, so the locale default is used;
        # confirm the corpus encoding and pass encoding=... explicitly.
        with open(path, "r") as fin:
            # Drop blank lines and surrounding whitespace, then join the
            # remaining lines with a bare newline (no stray padding).
            stripped_lines = (line.strip() for line in fin)
            body = "\n".join(line for line in stripped_lines if line)
        docs.append((topic, body))
# Tabulate the documents: one row per file with its topic label and its text.
# NOTE(review): dtype="category" applies to both columns, so "body" is stored
# as a categorical as well -- confirm that is intended.
df = pd.DataFrame(
    docs,
    columns=["topic", "body"],
    dtype="category",
)
print(df.head())
# Number of documents per topic.  A bare expression is only displayed by a
# notebook; when run as a script the counts have to be printed explicitly.
print(df.topic.value_counts())
# MeCab tagger created with the "-Owakati" output option.
tagger = MeCab.Tagger("-Owakati")


def parse_to_wakati(text):
    """Run *text* through the module-level ``tagger`` and strip the result."""
    segmented = tagger.parse(text)
    return segmented.strip()


# Store the tagger output for every body in a new "body_wakati" column.
wakati_bodies = df.body.apply(parse_to_wakati)
df = df.assign(body_wakati=wakati_bodies)
print(df.head())
print(df.body_wakati.head())
# Encode the topic names as class ids with a LabelEncoder fitted on df.topic.
le = LabelEncoder()
y = le.fit_transform(df.topic)
print(le.classes_)
# Spot-check the ids assigned to two of the topics.
for topic_name in ("computer_graphics", "operating_systems"):
    print(le.transform([topic_name]))
# Hold out 20% of the tokenized documents for evaluation; the split is
# shuffled with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 10, "shuffle": True}
X_train, X_test, y_train, y_test = train_test_split(
    df.body_wakati, y, **split_options
)
# Term-frequency features for the training documents (printed for inspection).
count_vect = CountVectorizer()
# Fit on the training split only.  Fitting on df.body_wakati would also learn
# the vocabulary of the held-out test documents, and the resulting matrix
# would not be the "train" matrix its name claims to be.
X_train_counts = count_vect.fit_transform(X_train)
# use_idf=False turns off the IDF weighting of the transformer.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
print(X_train_tf)
print(X_train_tf.shape)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix
class RulebasedEstimator(BaseEstimator, TransformerMixin):
    """Keyword-rule baseline that maps each text to an encoded topic label.

    Parameters
    ----------
    label_encoder
        Object whose ``transform([name])`` yields the encoded label for a
        topic name; stored as ``self.le``.
    """

    # Ordered (regex, topic) rules: the first regex found in a text decides
    # its topic.
    # NOTE(review): "コンピュータ" and "computer" already match the first rule,
    # so those alternatives can never trigger the computer_security or
    # computer_software rules further down.
    _RULES = (
        (r"(コンピュータ|グラフィックス|computer|graphics)", "computer_graphics"),
        (r"(オペレーティング|システム|operating|system)", "operating_systems"),
        (r"(暗号|セキュリティ|computer|security)", "computer_security"),
        (r"(アプリケーション|サービス)", "application_service"),
        (r"(コンピュータ|ソフトウェア|computer|software)", "computer_software"),
        (r"(人工|知能)", "artificial_intelligence"),
        (r"(検索|エンジン)", "search_engine"),
        (r"(情報|社会)", "information_society"),
    )

    def __init__(self, label_encoder):
        self.le = label_encoder

    def fit(self, X, y):
        """Do nothing (the rules are fixed) and return ``self``."""
        return self

    def predict(self, X):
        """Apply the keyword rules to every text in ``X``.

        Returns a list with one entry per text: the encoded label of the
        first rule whose regex is found in the text, or ``0`` when no rule
        matches.
        """
        predictions = []
        for document in X:
            label = 0  # fallback when none of the rules fires
            for pattern, topic_name in self._RULES:
                if re.search(pattern, document):
                    label = self.le.transform([topic_name])[0]
                    break
            predictions.append(label)
        return predictions
# Evaluate the keyword baseline on the held-out split.
rulebased = RulebasedEstimator(label_encoder=le)
rulebased_pred = rulebased.predict(X_test)
from sklearn.metrics import confusion_matrix
# Print the matrix: as a bare expression its value is only shown by a
# notebook and is silently discarded when the file runs as a script.
print(confusion_matrix(y_test, rulebased_pred))
print(classification_report(y_test, rulebased_pred, target_names=le.classes_))
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
#pca = make_pipeline(StandardScaler(),PCA(n_components=2, random_state=0))
from sklearn.decomposition import TruncatedSVD
# Pipeline is used below, but only make_pipeline is imported at file level;
# without this import the script stops with a NameError.
from sklearn.pipeline import Pipeline

# Dimensionality reduction step, shared with the pipeline below.
# NOTE(review): 1693 components is hard-coded; TruncatedSVD cannot produce
# more components than there are TF-IDF features -- confirm for this corpus.
svd = TruncatedSVD(n_components=1693, n_iter=7, random_state=42)

# TF-IDF -> scaling -> truncated SVD -> linear classifier trained with SGD.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # TfidfVectorizer produces a sparse matrix and StandardScaler refuses to
    # centre sparse input, so only the variance scaling is applied.
    ('std', StandardScaler(with_mean=False)),
    ('PCA', svd),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=90,
                          max_iter=100, tol=None)),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Print the matrix explicitly; a bare expression shows nothing in a script.
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted, target_names=le.classes_))
IyB5b3VyIGNvZGUgZ29lcyBoZXJlCiF3Z2V0IGh0dHA6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi5jLmpwL35taWNoaS90b2t1Mi9ib29rX2NhdGVnb3J5LnRnegohdGFyIHp4ZiBib29rX2NhdGVnb3J5LnRnegoKCiFscyAuL2Jvb2tfY2F0ZWdvcnkKIWxzIC4vYm9va19jYXRlZ29yeSB8IGhlYWQKIXBpcCBpbnN0YWxsIG1lY2FiLXB5dGhvbjMKCmZyb20gc2tsZWFybi5wcmVwcm9jZXNzaW5nIGltcG9ydCBOb3JtYWxpemVyCgoKdG9waWNzID0gWwonY29tcHV0ZXJfZ3JhcGhpY3MnLAonb3BlcmF0aW5nX3N5c3RlbXMnLAonY29tcHV0ZXJfc2VjdXJpdHknLAonYXBwbGljYXRpb25fc2VydmljZScsCidjb21wdXRlcl9zb2Z0d2FyZScsCidhcnRpZmljaWFsX2ludGVsbGlnZW5jZScsCidzZWFyY2hfZW5naW5lJywKJ2luZm9ybWF0aW9uX3NvY2lldHknLApdCgoKaW1wb3J0IGdsb2IKaW1wb3J0IHJlCgppbXBvcnQgTWVDYWIKaW1wb3J0IHBhbmRhcyBhcyBwZAoKZnJvbSBza2xlYXJuLmZlYXR1cmVfZXh0cmFjdGlvbi50ZXh0IGltcG9ydCBUZmlkZlZlY3Rvcml6ZXIsIENvdW50VmVjdG9yaXplcgpmcm9tIHNrbGVhcm4ucHJlcHJvY2Vzc2luZyBpbXBvcnQgTGFiZWxFbmNvZGVyCmZyb20gc2tsZWFybi5tb2RlbF9zZWxlY3Rpb24gaW1wb3J0IHRyYWluX3Rlc3Rfc3BsaXQKZnJvbSBza2xlYXJuLmZlYXR1cmVfZXh0cmFjdGlvbi50ZXh0IGltcG9ydCBUZmlkZlRyYW5zZm9ybWVyCmZyb20gc2tsZWFybi5waXBlbGluZSBpbXBvcnQgbWFrZV9waXBlbGluZQpkb2NzID0gW10KZm9yIHRvcGljIGluIHRvcGljczoKICAgIGZvciBmIGluIGdsb2IuZ2xvYihmIi4vYm9va19jYXRlZ29yeS97dG9waWN9LyoudHh0Iik6CiAgICAgICAgCiAgICAgICAgd2l0aCBvcGVuKGYsICJyIikgYXMgZmluOgogICAgICAgICAgICBib2R5ID0gIlxuIi5qb2luKFtsaW5lLnN0cmlwKCkgZm9yIGxpbmUgaW4gZmluIGlmIGxpbmUuc3RyaXAoKV0pCiAgICAgICAgZG9jcy5hcHBlbmQoKHRvcGljLGJvZHkpKQogICAgICAKCmRmID0gcGQuRGF0YUZyYW1lKAogICAgICAgIGRvY3MsCiAgICAgICAgY29sdW1ucz1bInRvcGljIiwiYm9keSJdLAogICAgICAgIGR0eXBlPSJjYXRlZ29yeSIKKQoKCgpwcmludChkZi5oZWFkKCkpCgoKZGYudG9waWMudmFsdWVfY291bnRzKCkKCgp0YWdnZXIgPSBNZUNhYi5UYWdnZXIoIi1Pd2FrYXRpIikKCmRlZiBwYXJzZV90b193YWthdGkodGV4dCk6CiAgICByZXR1cm4gdGFnZ2VyLnBhcnNlKHRleHQpLnN0cmlwKCkKCmRmID0gZGYuYXNzaWduKGJvZHlfd2FrYXRpPWRmLmJvZHkuYXBwbHkocGFyc2VfdG9fd2FrYXRpKSkKCnByaW50KGRmLmhlYWQoKSkKCnByaW50KGRmLmJvZHlfd2FrYXRpLmhlYWQoKSkKCgpsZSA9IExhYmVsRW5jb2RlcigpCnkgPSBsZS5maXRfdHJhbnNmb3JtKGRmLnRvcGljKQoKCnByaW50KGxlLmNsYXNzZXNfKQpwcmludChsZS50cmFuc2Zvcm0oWyJjb21wdXRlcl9ncmFwaGlj
cyJdKSkKcHJpbnQobGUudHJhbnNmb3JtKFsib3BlcmF0aW5nX3N5c3RlbXMiXSkpCgoKClhfdHJhaW4sIFhfdGVzdCwgeV90cmFpbiwgeV90ZXN0ID0gdHJhaW5fdGVzdF9zcGxpdCgKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkZi5ib2R5X3dha2F0aSwgIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHksIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHRlc3Rfc2l6ZT0wLjIsIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHJhbmRvbV9zdGF0ZT0xMCwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgc2h1ZmZsZT1UcnVlCikKCgpjb3VudF92ZWN0ID0gQ291bnRWZWN0b3JpemVyKCkKWF90cmFpbl9jb3VudHMgPSBjb3VudF92ZWN0LmZpdF90cmFuc2Zvcm0oZGYuYm9keV93YWthdGkpCnRmX3RyYW5zZm9ybWVyID0gVGZpZGZUcmFuc2Zvcm1lcih1c2VfaWRmPUZhbHNlKS5maXQoWF90cmFpbl9jb3VudHMpClhfdHJhaW5fdGYgPSB0Zl90cmFuc2Zvcm1lci50cmFuc2Zvcm0oWF90cmFpbl9jb3VudHMpCnByaW50KFhfdHJhaW5fdGYpCnByaW50KFhfdHJhaW5fdGYuc2hhcGUpCgoKZnJvbSBza2xlYXJuLmJhc2UgaW1wb3J0IEJhc2VFc3RpbWF0b3IsIFRyYW5zZm9ybWVyTWl4aW4KZnJvbSBza2xlYXJuLm1ldHJpY3MgaW1wb3J0IGNsYXNzaWZpY2F0aW9uX3JlcG9ydCwgY29uZnVzaW9uX21hdHJpeApjbGFzcyBSdWxlYmFzZWRFc3RpbWF0b3IoQmFzZUVzdGltYXRvciwgVHJhbnNmb3JtZXJNaXhpbik6CiAgICBkZWYgX19pbml0X18oc2VsZiwgbGFiZWxfZW5jb2Rlcik6CiAgICAgICAgc2VsZi5sZSA9IGxhYmVsX2VuY29kZXIKICAKICAgIGRlZiBmaXQoc2VsZiwgWCwgeSk6CiAgICAgICAgcmV0dXJuIHNlbGYKCiAgICBkZWYgcHJlZGljdChzZWxmLCBYKToKICAgICAgICAiIiLjg6vjg7zjg6vjgpLoqJjov7AiIiIKICAgICAgICByZXN1bHQgPSBbXQogICAgICAgIGZvciB0ZXh0IGluIFg6CiAgICAgICAgICAgIHByZWQgPSAwCiAgICAgICAgICAgIGlmIHJlLnNlYXJjaChyIijjgrPjg7Pjg5Tjg6Xjg7zjgr9844Kw44Op44OV44Kj44OD44Kv44K5fGNvbXB1dGVyfGdyYXBoaWNzKSIsIHRleHQpOgogICAgICAgICAgICAgICAgcHJlZCA9IHNlbGYubGUudHJhbnNmb3JtKFsiY29tcHV0ZXJfZ3JhcGhpY3MiXSlbMF0KICAgICAgICAgICAgZWxpZiByZS5zZWFyY2gociIo44Kq44Oa44Os44O844OG44Kj44Oz44KwfOOCt+OCueODhuODoHxvcGVyYXRpbmd8c3lzdGVtKSIsIHRleHQpOgogICAgICAgICAgICAgICAgcHJlZCA9IHNlbGYubGUudHJhbnNmb3JtKFsib3BlcmF0aW5nX3N5c3RlbXMiXSlbMF0KICAgICAgICAgICAgZWxpZiByZS5zZWFyY2gociIo5pqX5Y+3fOOCu+OCreODpeODquODhuOCo3xjb21wdXRlcnxzZWN1cml0eSkiLCB0ZXh0KToKICAgICAgICAgICAgICAgIHByZWQgPSBzZWxmLmxlLnRyYW5zZm9y
bShbImNvbXB1dGVyX3NlY3VyaXR5Il0pWzBdCiAgICAgICAgICAgIGVsaWYgcmUuc2VhcmNoKHIiKOOCouODl+ODquOCseODvOOCt+ODp+ODs3zjgrXjg7zjg5PjgrkpIiwgdGV4dCk6CiAgICAgICAgICAgICAgICBwcmVkID0gc2VsZi5sZS50cmFuc2Zvcm0oWyJhcHBsaWNhdGlvbl9zZXJ2aWNlIl0pWzBdCiAgICAgICAgICAgIGVsaWYgcmUuc2VhcmNoKHIiKOOCs+ODs+ODlOODpeODvOOCv3zjgr3jg5Xjg4jjgqbjgqfjgqJ8Y29tcHV0ZXJ8c29mdHdhcmUpIiwgdGV4dCk6CiAgICAgICAgICAgICAgICBwcmVkID0gc2VsZi5sZS50cmFuc2Zvcm0oWyJjb21wdXRlcl9zb2Z0d2FyZSJdKVswXQogICAgICAgICAgICBlbGlmIHJlLnNlYXJjaChyIijkurrlt6V855+l6IO9KSIsIHRleHQpOgogICAgICAgICAgICAgICAgcHJlZCA9IHNlbGYubGUudHJhbnNmb3JtKFsiYXJ0aWZpY2lhbF9pbnRlbGxpZ2VuY2UiXSlbMF0KICAgICAgICAgICAgZWxpZiByZS5zZWFyY2gociIo5qSc57SifOOCqOODs+OCuOODsykiLCB0ZXh0KToKICAgICAgICAgICAgICAgIHByZWQgPSBzZWxmLmxlLnRyYW5zZm9ybShbInNlYXJjaF9lbmdpbmUiXSlbMF0KICAgICAgICAgICAgZWxpZiByZS5zZWFyY2gociIo5oOF5aCxfOekvuS8mikiLCB0ZXh0KToKICAgICAgICAgICAgICAgIHByZWQgPSBzZWxmLmxlLnRyYW5zZm9ybShbImluZm9ybWF0aW9uX3NvY2lldHkiXSlbMF0KICAgICAgICAgICAgcmVzdWx0LmFwcGVuZChwcmVkKQogICAgICAgIHJldHVybiByZXN1bHQKCgpydWxlYmFzZWQgPSBSdWxlYmFzZWRFc3RpbWF0b3IobGFiZWxfZW5jb2Rlcj1sZSkKcnVsZWJhc2VkX3ByZWQgPSBydWxlYmFzZWQucHJlZGljdChYX3Rlc3QpCgoKZnJvbSBza2xlYXJuLm1ldHJpY3MgaW1wb3J0IGNvbmZ1c2lvbl9tYXRyaXgKY29uZnVzaW9uX21hdHJpeCh5X3Rlc3QscnVsZWJhc2VkX3ByZWQpCgpwcmludChjbGFzc2lmaWNhdGlvbl9yZXBvcnQoeV90ZXN0LCBydWxlYmFzZWRfcHJlZCwgdGFyZ2V0X25hbWVzPWxlLmNsYXNzZXNfKSkKCmZyb20gc2tsZWFybi5kZWNvbXBvc2l0aW9uIGltcG9ydCBQQ0EKZnJvbSBza2xlYXJuLnByZXByb2Nlc3NpbmcgaW1wb3J0IFN0YW5kYXJkU2NhbGVyCgpmcm9tIHNrbGVhcm4ubGluZWFyX21vZGVsIGltcG9ydCBTR0RDbGFzc2lmaWVyCiNwY2EgPSBtYWtlX3BpcGVsaW5lKFN0YW5kYXJkU2NhbGVyKCksUENBKG5fY29tcG9uZW50cz0yLCByYW5kb21fc3RhdGU9MCkpCgpmcm9tIHNrbGVhcm4uZGVjb21wb3NpdGlvbiBpbXBvcnQgVHJ1bmNhdGVkU1ZECgpzdmQgPSBUcnVuY2F0ZWRTVkQobl9jb21wb25lbnRzPTE2OTMgLCBuX2l0ZXI9NywgcmFuZG9tX3N0YXRlPTQyKQoKCnRleHRfY2xmID0gUGlwZWxpbmUoWwogICAgKCd0ZmlkZicsIFRmaWRmVmVjdG9yaXplcigpKSwKICAgICgnc3RkJyxTdGFuZGFyZFNjYWxlcigpKSwKICAgICgnUENBJywgVHJ1bmNhdGVkU1ZEKG5fY29tcG9uZW50cz0xNjkzICwgbl9pdGVyPTcsIHJhbmRv
bV9zdGF0ZT00MikpLAogICAgKCdjbGYnLCBTR0RDbGFzc2lmaWVyKGxvc3M9J2hpbmdlJywgcGVuYWx0eT0nbDInLAogICAgICAgICAgICAgICAgICAgICAgICAgIGFscGhhPTFlLTMsIHJhbmRvbV9zdGF0ZT05MCwKICAgICAgICAgICAgICAgICAgICAgICAgICBtYXhfaXRlcj0xMDAsIHRvbD1Ob25lKSksCl0pCgp0ZXh0X2NsZi5maXQoWF90cmFpbix5X3RyYWluKQoKcHJlZGljdGVkID0gdGV4dF9jbGYucHJlZGljdChYX3Rlc3QpCgoKY29uZnVzaW9uX21hdHJpeCh5X3Rlc3QscHJlZGljdGVkKQoKcHJpbnQoY2xhc3NpZmljYXRpb25fcmVwb3J0KHlfdGVzdCwgcHJlZGljdGVkLCB0YXJnZXRfbmFtZXM9bGUuY2xhc3Nlc18pKQ==
compilation info
Traceback (most recent call last):
File "/usr/lib/python3.7/py_compile.py", line 143, in compile
_optimize=optimize)
File "<frozen importlib._bootstrap_external>", line 791, in source_to_code
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "./prog.py", line 2
!wget http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz
^
SyntaxError: invalid syntax
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/usr/lib/python3.7/py_compile.py", line 147, in compile
raise py_exc
py_compile.PyCompileError: File "./prog.py", line 2
!wget http://www.cs.gunma-u.ac.jp/~michi/toku2/book_category.tgz
^
SyntaxError: invalid syntax
stdout