# Language classifier using naive Bayes
def lista_palabras(texto):
    """Return the distinct words of *texto* that are longer than two characters.

    The text is lower-cased and split on whitespace only, so punctuation
    attached to a word stays part of that word. Words keep the order of
    their first appearance and each word is returned once.

    Args:
        texto: The text to tokenise.

    Returns:
        A list of unique lower-case words with more than two characters.
    """
    palabras = []
    # The set gives O(1) duplicate checks; the list keeps first-seen order.
    vistas = set()
    for p in texto.lower().split():
        if len(p) > 2 and p not in vistas:
            vistas.add(p)
            palabras.append(p)
    return palabras
def entrenar(textos):
    """Build the count tables used by ``clasificar`` from labelled texts.

    Args:
        textos: An iterable of items whose element ``[0]`` is the text and
            whose element ``[1]`` is its category.

    Returns:
        A tuple ``(c_palabras, c_categorias, c_textos, c_tot_palabras)``:

        * ``c_palabras``: dict mapping each word to a dict that maps every
          category to the number of texts of that category containing the
          word (``lista_palabras`` yields each word once per text).
        * ``c_categorias``: dict mapping each category to its number of texts.
        * ``c_textos``: total number of texts.
        * ``c_tot_palabras``: number of distinct words seen.
    """
    # The data is traversed twice, so materialise it in case it is a
    # one-shot iterator.
    textos = list(textos)
    c_textos = len(textos)

    # First pass: number of texts per category.
    c_categorias = {}
    for t in textos:
        c_categorias[t[1]] = c_categorias.get(t[1], 0) + 1

    # Second pass: per-word, per-category counts.
    c_palabras = {}
    for t in textos:
        for p in lista_palabras(t[0]):
            if p not in c_palabras:
                # Every word gets a zeroed counter for every category, so
                # c_palabras[p][c] is defined for any known category c.
                c_palabras[p] = dict.fromkeys(c_categorias, 0)
            # Count every word of the text, not only the last one.
            c_palabras[p][t[1]] += 1

    c_tot_palabras = len(c_palabras)
    return (c_palabras, c_categorias, c_textos, c_tot_palabras)
def clasificar(texto, c_palabras, c_categorias, c_textos, c_tot_palabras):
    """Return the best-scoring category for *texto* together with its score.

    Args:
        texto: The text to classify.
        c_palabras: dict word -> dict category -> count, as built by
            ``entrenar``.
        c_categorias: dict category -> number of training texts.
        c_textos: Total number of training texts.
        c_tot_palabras: Number of distinct words seen during training.

    Returns:
        A tuple ``(categoria, prob_categoria)``. The result is ``("", 0)``
        when no category reaches a score greater than zero.
    """
    # The word list does not depend on the category, so compute it once.
    palabras = lista_palabras(texto)

    categoria = ""
    prob_categoria = 0
    for c in c_categorias:
        # Prior of the category: share of training texts labelled c.
        prob_c = float(c_categorias[c]) / float(c_textos)
        prob_total_c = prob_c
        for p in palabras:
            # Words never seen in training carry no information; skip them.
            if p not in c_palabras:
                continue
            # Share of the vocabulary counts that word p has in category c.
            prob_p = float(c_palabras[p][c]) / float(c_tot_palabras)
            # NOTE(review): the per-word factor works out to
            # prob_p**2 / prob_c**2, which is not the textbook
            # P(word | category). It is kept unchanged so scores stay the
            # same; confirm the intended formula.
            prob_cond = prob_p / prob_c
            prob = (prob_cond * prob_p) / prob_c
            # There is no smoothing: a known word with a zero count for c
            # drives the whole score for c to zero.
            prob_total_c = prob_total_c * prob
        # Keep the best category seen so far; ties keep the earlier one.
        if prob_categoria < prob_total_c:
            categoria = c
            prob_categoria = prob_total_c
    return (categoria, prob_categoria)
if __name__ == "__main__":
    # Labelled training sentences as [text, category] pairs.
    textos = [
        ["Allez tout droit et prenez la prochaine rue", "frances"],
        ["Go straight and take the next street", "ingles"],
        ["Siga recto y gire en la siguiente calle", "espanol"],
        ["Intelligence Artificielle est une discipline tres interessante", "frances"],
        ["Artificial Intelligence is very interesting discipline", "ingles"],
        ["La Inteligencia Artificial es una disciplina muy interesante", "espanol"],
    ]

    # entrenar returns the four count tables in the order clasificar expects.
    modelo = entrenar(textos)

    frase = "The straight lineto the Artificial Intelligence is a turn of our brain"
    # Alternative sample sentences in Spanish and French:
    #La recta a la Inteligencia Artificial es un giro de nuestro cerebro
    #La ligne droite vers l'Intelligence Artificielle est un tour de notre cerveau
    clase = clasificar(frase, *modelo)

    print("Texto clasificado como: " + str(clase))
I2NsYXNpZmljYWRvciBpZGlvbWFzIGNvbiBuYWl2ZSBiYXllcyAKZGVmIGxpc3RhX3BhbGFicmFzKHRleHRvKTogCiAgICBwYWxhYnJhcz1bXQogICAgcGFsYWJyYXNfdG1wPXRleHRvLmxvd2VyKCkuc3BsaXQoKSAKICAgIGZvciBwIGluIHBhbGFicmFzX3RtcDoKICAgICAgICBpZiBwIG5vdCBpbiBwYWxhYnJhcyBhbmQgbGVuKHApPjI6CiAgICAgICAgICAgIHBhbGFicmFzLmFwcGVuZChwKQogICAgcmV0dXJuICBwYWxhYnJhcwoKZGVmIGVudHJlbmFyICh0ZXh0b3MpOgogICAgY19wYWxhYnJhcz17fQogICAgY19jYXRlZ29yaWFzPXt9IAogICAgY190ZXh0b3M9MCAKICAgIGNfdG90X3BhbGFicmFzPTAKI2FuYWRpciBhbCBkaWNjaW9uYXJpbyBsYXMgY2F0ZWdvcmlhcwogICAgZm9yIHQgaW4gdGV4dG9zOgogICAgICAgIGNfdGV4dG9zPWNfdGV4dG9zKzEKICAgICAgICBpZiB0WzFdIG5vdCBpbiBjX2NhdGVnb3JpYXM6CiAgICAgICAgICAgIGNfY2F0ZWdvcmlhc1t0WzFdXT0xCiAgICAgICAgZWxzZTogCiAgICAgICAgIGNfY2F0ZWdvcmlhc1t0WzFdXT1jX2NhdGVnb3JpYXNbdFsxXV0rMQojYW5hZGlyIHBhbGFicmFzIGFsIGRpY2Npb25hcmlvCiAgICBmb3IgdCBpbiB0ZXh0b3M6CiAgICAgICAgcGFsYWJyYXM9bGlzdGFfcGFsYWJyYXModFswXSkKICAgICAgICBmb3IgcCBpbiBwYWxhYnJhczoKICAgICAgICAgICAgaWYgcCBub3QgaW4gY19wYWxhYnJhcyA6CiAgICAgICAgICAgICAgICBjX3RvdF9wYWxhYnJhcz0gY190b3RfcGFsYWJyYXMgKzEKICAgICAgICAgICAgICAgIGNfcGFsYWJyYXMgW3BdPXt9CiAgICAgICAgICAgICAgICBmb3IgYyBpbiBjX2NhdGVnb3JpYXM6CiAgICAgICAgICAgICAgICAgICAgY19wYWxhYnJhc1twXVtjXT0wCiAgICAgICAgY19wYWxhYnJhcyBbcF1bdFsxXV09IGNfcGFsYWJyYXMgW3BdW3RbMV1dICsxCiAgICByZXR1cm4gKGNfcGFsYWJyYXMsIGNfY2F0ZWdvcmlhcywgY190ZXh0b3MsIGNfdG90X3BhbGFicmFzKQpkZWYgY2xhc2lmaWNhciAodGV4dG8sIGNfcGFsYWJyYXMsIGNfY2F0ZWdvcmlhcywgY190ZXh0b3MsIGNfdG90X3BhbGFicmFzKToKICAgIGNhdGVnb3JpYT0iIgogICAgcHJvYl9jYXRlZ29yaWE9MAogICAgZm9yIGMgaW4gY19jYXRlZ29yaWFzOgogICAgICAgICNwcm9iYWJpbGlkYWQgZGUgbGEgY2F0ZWdvcmlhCiAgICAgICAgcHJvYl9jPWZsb2F0KGNfY2F0ZWdvcmlhc1tjXSkgLyBmbG9hdChjX3RleHRvcykKICAgICAgICBwYWxhYnJhcz1saXN0YV9wYWxhYnJhcyh0ZXh0bykKICAgICAgICBwcm9iX3RvdGFsX2M9cHJvYl9jCiAgICAgICAgZm9yIHAgaW4gcGFsYWJyYXM6CiAgICAjcHJvYmFiaWxpZGFkIGRlIGxhIHBhbGFicmEgCiAgICAgICAgIGlmIHAgaW4gY19wYWxhYnJhczoKICAgICAgICAgICAgcHJvYl9wPWZsb2F0KGNfcGFsYWJyYXMgW3BdIFtjXSApIC8gZmxvYXQgKGNfdG90X3BhbGFicmFzKQogICAgICAgICMgcHJvYmFiaWxpZGFkIFAgKGNhdGVnb3JpYSBJ
IHBhbGFicmEpCiAgICAgICAgcHJvYl9jb25kPXByb2JfcC9wcm9iX2MKICAgICAgICAjIHByb2JhYmlsaWRhZCBQIChwYWxhYnJhIEkgY2F0ZWdvcmlhKQogICAgICAgIHByb2I9KHByb2JfY29uZCpwcm9iX3ApIC8gcHJvYl9jCiAgICAgICAgcHJvYl90b3RhbF9jPXByb2JfdG90YWxfYypwcm9iCiAgICBpZiBwcm9iX2NhdGVnb3JpYTxwcm9iX3RvdGFsX2M6CiAgICAgICAgY2F0ZWdvcmlhPWMKICAgICAgICBwcm9iX2NhdGVnb3JpYT1wcm9iX3RvdGFsX2MKICAgIHJldHVybiAoY2F0ZWdvcmlhLCBwcm9iX2NhdGVnb3JpYSkKaWYgX19uYW1lX18gPT0gIl9fbWFpbl9fIjoKICAgIHRleHRvcz1bCiAgICAgICAgWyJBbGxleiB0b3V0IGRyb2l0IGV0IHByZW5leiBsYSBwcm9jaGFpbmUgcnVlIiwgImZyYW5jZXMiXSwKICAgICAgICBbIkdvIHN0cmFpZ2h0IGFuZCB0YWtlIHRoZSBuZXh0IHN0cmVldCIsICJpbmdsZXMiXSwKICAgICAgICBbIlNpZ2EgcmVjdG8geSBnaXJlIGVuIGxhIHNpZ3VpZW50ZSBjYWxsZSIsICJlc3Bhbm9sIl0sCiAgICAgICAgWyJJbnRlbGxpZ2VuY2UgQXJ0aWZpY2llbGxlIGVzdCB1bmUgZGlzY2lwbGluZSB0cmVzIGludGVyZXNzYW50ZSIsICJmcmFuY2VzIl0sCiAgICAgICAgWyJBcnRpZmljaWFsIEludGVsbGlnZW5jZSBpcyB2ZXJ5IGludGVyZXN0aW5nIGRpc2NpcGxpbmUiLCAiaW5nbGVzIl0sCiAgICAgICAgWyJMYSBJbnRlbGlnZW5jaWEgQXJ0aWZpY2lhbCBlcyB1bmEgZGlzY2lwbGluYSBtdXkgaW50ZXJlc2FudGUiLCAiZXNwYW5vbCJdCiAgICBdCiAgICBwLGMsdCx0cD0gZW50cmVuYXIodGV4dG9zKQogICAgY2xhc2UgPSBjbGFzaWZpY2FyKCJUaGUgc3RyYWlnaHQgbGluZXRvIHRoZSBBcnRpZmljaWFsIEludGVsbGlnZW5jZSBpcyBhIHR1cm4gb2Ygb3VyIGJyYWluIiwgcCxjLHQsdHApCiAgICAjTGEgcmVjdGEgYSBsYSBJbnRlbGlnZW5jaWEgQXJ0aWZpY2lhbCBlcyB1biBnaXJvIGRlIG51ZXN0cm8gY2VyZWJybwogICAgI0xhIGxpZ25lIGRyb2l0ZSB2ZXJzIGwnSW50ZWxsaWdlbmNlIEFydGlmaWNpZWxsZSBlc3QgdW4gdG91ciBkZSBub3RyZSBjZXJ2ZWF1CiAgICBwcmludCAoIlRleHRvIGNsYXNpZmljYWRvIGNvbW86ICIrIHN0cihjbGFzZSkp