import pandas as pd
import os
from nltk.corpus import stopwords
import nltk.data
import logging
import numpy as np  # Make sure that numpy is imported
from gensim.models import Word2Vec

from KaggleWord2VecUtility import KaggleWord2VecUtility


if __name__ == '__main__':

    # Read data from files
    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'labeledTrainData.tsv'),
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'testData.tsv'),
                       header=0, delimiter="\t", quoting=3)
    unlabeled_train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'unlabeledTrainData.tsv'),
                                  header=0, delimiter="\t", quoting=3)

    # Verify the number of reviews that were read (100,000 in total)
    print("Read %d labeled train reviews, %d labeled test reviews, "
          "and %d unlabeled reviews\n" % (train["review"].size,
                                          test["review"].size,
                                          unlabeled_train["review"].size))

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # ****** Split the labeled and unlabeled training sets into clean sentences
    #
    sentences = []  # Initialize an empty list of sentences

    print("Parsing sentences from training set")
    for review in train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    print("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)

    # ****** Set parameters and train the word2vec model
    #
    # Configure the built-in logging module so that Word2Vec
    # creates nice output messages
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time).
    # Note: gensim >= 4.0 renamed the `size` parameter to `vector_size`.
    print("Training Word2Vec model...")
    model = Word2Vec(sentences, workers=num_workers,
                     size=num_features, min_count=min_word_count,
                     window=context, sample=downsampling, seed=1)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # It can be helpful to create a meaningful model name and save the
    # model for later use. You can load it later using Word2Vec.load().
    # Note that model.save() writes gensim's own format regardless of the
    # file extension, so a single save is enough.
    model_name = "300features_40minwords_10context"
    model.save(model_name)

    # Explore the model; print the results so they are visible when this
    # file is run as a script rather than interactively. The query methods
    # live on model.wv in gensim >= 1.0.
    print(model.wv.doesnt_match("man woman child kitchen".split()))
    print(model.wv.doesnt_match("france england germany berlin".split()))
    print(model.wv.doesnt_match("paris berlin london austria".split()))
    print(model.wv.most_similar("man"))
    print(model.wv.most_similar("queen"))
    print(model.wv.most_similar("awful"))
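
The script expects KaggleWord2VecUtility.py (from Kaggle's word2vec tutorial repository) to sit in the same directory. If that helper is missing, a minimal stand-in for the one class used above could look like the sketch below; this is a hypothetical reimplementation, not the tutorial's exact code, and it assumes the reviews may contain HTML markup (hence BeautifulSoup).

# Minimal stand-in for KaggleWord2VecUtility -- a sketch, not the
# tutorial's exact code.
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

class KaggleWord2VecUtility(object):
    @staticmethod
    def review_to_wordlist(review, remove_stopwords=False):
        # Strip HTML, keep letters only, lowercase, and split into words.
        review_text = BeautifulSoup(review, "html.parser").get_text()
        review_text = re.sub("[^a-zA-Z]", " ", review_text)
        words = review_text.lower().split()
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]
        return words

    @staticmethod
    def review_to_sentences(review, tokenizer, remove_stopwords=False):
        # Split a review into sentences, then each sentence into a word list;
        # word2vec expects a list of sentences, each a list of word strings.
        raw_sentences = tokenizer.tokenize(review.strip())
        sentences = []
        for raw_sentence in raw_sentences:
            if len(raw_sentence) > 0:
                sentences.append(KaggleWord2VecUtility.review_to_wordlist(
                    raw_sentence, remove_stopwords))
        return sentences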
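
As the comment in the script notes, the saved model can be reloaded in a later session with Word2Vec.load(). A minimal sketch, assuming the model_name used above:

# Reload the trained model in a later session -- assumes the file name
# written by the training script above.
from gensim.models import Word2Vec

model = Word2Vec.load("300features_40minwords_10context")

# Individual word vectors live on model.wv; each is a num_features-dimensional
# numpy array (300 dimensions here).
vector = model.wv["man"]
print(vector.shape)                          # (300,)
print(model.wv.most_similar("man", topn=5))  # five nearest neighbours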