fork download
  1. import java.util.HashSet;
  2. import java.util.Map;
  3. import java.util.Scanner;
  4. import java.util.Set;
  5. import java.util.StringTokenizer;
  6. import java.util.TreeMap;
  7.  
  8. public class Main
  9. {
  10. //==========================================================================
  11. // Interface
  12. //==========================================================================
  13.  
  14. public interface FrequencyVocabulary
  15. {
  16. public class Statistic
  17. {
  18. public int count;
  19. public double frequency;
  20. public Statistic()
  21. {
  22. count = 0;
  23. frequency = 0;
  24. }
  25. }
  26.  
  27. public TreeMap<String,Statistic> getMap();
  28. public String selectMaxFreqToken( Set<String> tokens );
  29. }
  30.  
  31. //==========================================================================
  32. // Implementation
  33. //==========================================================================
  34.  
  35. public class FrequencyVocabularyImpl implements FrequencyVocabulary
  36. {
  37. TreeMap<String, Statistic> map;
  38.  
  39. public FrequencyVocabularyImpl(String text, String delimiters)
  40. {
  41. map = new TreeMap<String, Statistic>();
  42. int totalCount = 0;
  43.  
  44. // first we parse text to tokens (words) and fill the map where keys are words and values - statistic for given word
  45. StringTokenizer st = new StringTokenizer(text, delimiters);
  46. while (st.hasMoreTokens())
  47. {
  48. String key = st.nextToken();
  49. if ( containsDigits(key) )
  50. {
  51. continue;
  52. }
  53. Statistic value = map.get(key);
  54. if (null != value)
  55. {
  56. value.count++;
  57. map.replace(key, value);
  58. }
  59. else
  60. {
  61. value = new Statistic();
  62. value.count = 1;
  63. map.put( key, value );
  64. }
  65. totalCount += value.count;
  66. }
  67.  
  68. // now we can calculate and set frequency field for all the entries
  69. for (Map.Entry<String, Statistic> entry : map.entrySet())
  70. {
  71. if ( entry.getValue().count != 0 )
  72. {
  73. entry.getValue().frequency = ((double)entry.getValue().count) / totalCount;
  74. }
  75. }
  76. }
  77.  
  78. public TreeMap<String, Statistic> getMap()
  79. {
  80. return map;
  81. }
  82.  
  83. // select token with max frequency from the given set of tokens
  84. public String selectMaxFreqToken( Set<String> tokens )
  85. {
  86. String maxFreqToken = null;
  87. Statistic maxFreqStat = null;
  88. for( String token : tokens )
  89. {
  90. Statistic stat = map.get(token);
  91. if ( stat != null )
  92. {
  93. if ( maxFreqToken == null )
  94. {
  95. maxFreqToken = token;
  96. maxFreqStat = stat;
  97. }
  98. else
  99. {
  100. if ( stat.count > maxFreqStat.count )
  101. {
  102. maxFreqToken = token;
  103. maxFreqStat = stat;
  104. }
  105. }
  106. }
  107. }
  108.  
  109. return maxFreqToken;
  110. }
  111.  
  112. public boolean containsDigits( String token )
  113. {
  114. if ( token.indexOf('0') >= 0 ||
  115. token.indexOf('1') >= 0 ||
  116. token.indexOf('2') >= 0 ||
  117. token.indexOf('3') >= 0 ||
  118. token.indexOf('4') >= 0 ||
  119. token.indexOf('5') >= 0 ||
  120. token.indexOf('6') >= 0 ||
  121. token.indexOf('7') >= 0 ||
  122. token.indexOf('8') >= 0 ||
  123. token.indexOf('9') >= 0
  124. )
  125. {
  126. return true;
  127. }
  128. return false;
  129. }
  130. }
  131.  
  132. //==========================================================================
  133. // Testing
  134. //==========================================================================
  135.  
  136. public static void main(String[] args)
  137. {
  138. String text = "";
  139.  
  140. Scanner in = new Scanner(System.in);
  141.  
  142. for(;;)
  143. {
  144. if ( ! in.hasNextLine() ) break;
  145. String inputLine = in.nextLine();
  146. if ( inputLine == null || inputLine.equals("%EOF%") ) break;
  147. text = text + inputLine + "\n";
  148. }
  149. in.close();
  150.  
  151. if ( text.isEmpty() )
  152. {
  153. System.out.println("*** Empty text ****");
  154. return;
  155. }
  156.  
  157. //System.out.println("*** DBG:input text is: <"+text+">");
  158.  
  159. Main obj = new Main();
  160.  
  161. FrequencyVocabulary fv = obj.new FrequencyVocabularyImpl( text, " ?.,;:-+\n\t{}[]=()<>*&%$#@\"\'`~!" );
  162.  
  163. TreeMap<String,FrequencyVocabulary.Statistic> statMap = fv.getMap();
  164. System.out.println("Word frequency statistics:\n");
  165. for (Map.Entry<String, FrequencyVocabulary.Statistic> statEntry : statMap.entrySet())
  166. {
  167. System.out.printf("%s\t,\t%d,\t%f\n",
  168. statEntry.getKey(),
  169. statEntry.getValue().count,
  170. statEntry.getValue().frequency
  171. );
  172. }
  173. System.out.println("--------------------------------------\n");
  174.  
  175. HashSet<String> tokens = new HashSet<String>();
  176. tokens.add("for");
  177. tokens.add("do");
  178. tokens.add("to");
  179. tokens.add("else");
  180. tokens.add("while");
  181. tokens.add("if");
  182. String caption = "Select max frequent token from: ";
  183. for( String s : tokens )
  184. {
  185. caption += " " + s;
  186. }
  187. System.out.println(caption);
  188. String s = fv.selectMaxFreqToken(tokens);
  189. if ( s == null ) s = "<null>";
  190. System.out.println("=> selected is: " + s );
  191. }
  192. }
Success #stdin #stdout 0.24s 321408KB
stdin
To be, or not to be: that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles,
And by opposing end them? To die: to sleep;
No more; and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to, 'tis a consummation
Devoutly to be wish'd. To die, to sleep;
To sleep: perchance to dream: ay, there's the rub;
For in that sleep of death what dreams may come
When we have shuffled off this mortal coil,
Must give us pause: there's the respect
That makes calamity of so long life;
For who would bear the whips and scorns of time,
The oppressor's wrong, the proud man's contumely,
The pangs of despised love, the law's delay,
The insolence of office and the spurns
That patient merit of the unworthy takes,
When he himself might his quietus make
With a bare bodkin? who would fardels bear,
To grunt and sweat under a weary life,
But that the dread of something after death,
The undiscover'd country from whose bourn
No traveller returns, puzzles the will
And makes us rather bear those ills we have
Than fly to others that we know not of?
Thus conscience does make cowards of us all;
And thus the native hue of resolution
Is sicklied o'er with the pale cast of thought,
And enterprises of great pith and moment
With this regard their currents turn awry,
And lose the name of action. - Soft you now!
The fair Ophelia! Nymph, in thy orisons
Be all my sins remember'd.

%EOF%
stdout
Word frequency statistics:

And	,	5,	0.007321
Be	,	1,	0.001464
But	,	1,	0.001464
Devoutly	,	1,	0.001464
For	,	2,	0.002928
Is	,	1,	0.001464
Must	,	1,	0.001464
No	,	2,	0.002928
Nymph	,	1,	0.001464
Ophelia	,	1,	0.001464
Or	,	1,	0.001464
Soft	,	1,	0.001464
Than	,	1,	0.001464
That	,	3,	0.004392
The	,	7,	0.010249
Thus	,	1,	0.001464
To	,	5,	0.007321
When	,	2,	0.002928
Whether	,	1,	0.001464
With	,	2,	0.002928
a	,	5,	0.007321
ache	,	1,	0.001464
action	,	1,	0.001464
after	,	1,	0.001464
against	,	1,	0.001464
all	,	2,	0.002928
and	,	7,	0.010249
arms	,	1,	0.001464
arrows	,	1,	0.001464
awry	,	1,	0.001464
ay	,	1,	0.001464
bare	,	1,	0.001464
be	,	3,	0.004392
bear	,	3,	0.004392
bodkin	,	1,	0.001464
bourn	,	1,	0.001464
by	,	2,	0.002928
calamity	,	1,	0.001464
cast	,	1,	0.001464
coil	,	1,	0.001464
come	,	1,	0.001464
conscience	,	1,	0.001464
consummation	,	1,	0.001464
contumely	,	1,	0.001464
country	,	1,	0.001464
cowards	,	1,	0.001464
currents	,	1,	0.001464
d	,	3,	0.004392
death	,	2,	0.002928
delay	,	1,	0.001464
despised	,	1,	0.001464
die	,	2,	0.002928
does	,	1,	0.001464
dread	,	1,	0.001464
dream	,	1,	0.001464
dreams	,	1,	0.001464
end	,	2,	0.002928
enterprises	,	1,	0.001464
er	,	1,	0.001464
fair	,	1,	0.001464
fardels	,	1,	0.001464
flesh	,	1,	0.001464
fly	,	1,	0.001464
fortune	,	1,	0.001464
from	,	1,	0.001464
give	,	1,	0.001464
great	,	1,	0.001464
grunt	,	1,	0.001464
have	,	2,	0.002928
he	,	1,	0.001464
heart	,	1,	0.001464
heir	,	1,	0.001464
himself	,	1,	0.001464
his	,	1,	0.001464
hue	,	1,	0.001464
ills	,	1,	0.001464
in	,	3,	0.004392
insolence	,	1,	0.001464
is	,	2,	0.002928
know	,	1,	0.001464
law	,	1,	0.001464
life	,	2,	0.002928
long	,	1,	0.001464
lose	,	1,	0.001464
love	,	1,	0.001464
make	,	2,	0.002928
makes	,	2,	0.002928
man	,	1,	0.001464
may	,	1,	0.001464
merit	,	1,	0.001464
might	,	1,	0.001464
mind	,	1,	0.001464
moment	,	1,	0.001464
more	,	1,	0.001464
mortal	,	1,	0.001464
my	,	1,	0.001464
name	,	1,	0.001464
native	,	1,	0.001464
natural	,	1,	0.001464
nobler	,	1,	0.001464
not	,	2,	0.002928
now	,	1,	0.001464
o	,	1,	0.001464
of	,	15,	0.021962
off	,	1,	0.001464
office	,	1,	0.001464
opposing	,	1,	0.001464
oppressor	,	1,	0.001464
or	,	1,	0.001464
orisons	,	1,	0.001464
others	,	1,	0.001464
outrageous	,	1,	0.001464
pale	,	1,	0.001464
pangs	,	1,	0.001464
patient	,	1,	0.001464
pause	,	1,	0.001464
perchance	,	1,	0.001464
pith	,	1,	0.001464
proud	,	1,	0.001464
puzzles	,	1,	0.001464
question	,	1,	0.001464
quietus	,	1,	0.001464
rather	,	1,	0.001464
regard	,	1,	0.001464
remember	,	1,	0.001464
resolution	,	1,	0.001464
respect	,	1,	0.001464
returns	,	1,	0.001464
rub	,	1,	0.001464
s	,	5,	0.007321
say	,	1,	0.001464
scorns	,	1,	0.001464
sea	,	1,	0.001464
shocks	,	1,	0.001464
shuffled	,	1,	0.001464
sicklied	,	1,	0.001464
sins	,	1,	0.001464
sleep	,	5,	0.007321
slings	,	1,	0.001464
so	,	1,	0.001464
something	,	1,	0.001464
spurns	,	1,	0.001464
suffer	,	1,	0.001464
sweat	,	1,	0.001464
take	,	1,	0.001464
takes	,	1,	0.001464
that	,	4,	0.005857
the	,	15,	0.021962
their	,	1,	0.001464
them	,	1,	0.001464
there	,	2,	0.002928
this	,	2,	0.002928
those	,	1,	0.001464
thought	,	1,	0.001464
thousand	,	1,	0.001464
thus	,	1,	0.001464
thy	,	1,	0.001464
time	,	1,	0.001464
tis	,	2,	0.002928
to	,	10,	0.014641
traveller	,	1,	0.001464
troubles	,	1,	0.001464
turn	,	1,	0.001464
under	,	1,	0.001464
undiscover	,	1,	0.001464
unworthy	,	1,	0.001464
us	,	3,	0.004392
we	,	4,	0.005857
weary	,	1,	0.001464
what	,	1,	0.001464
whips	,	1,	0.001464
who	,	2,	0.002928
whose	,	1,	0.001464
will	,	1,	0.001464
wish	,	1,	0.001464
with	,	1,	0.001464
would	,	2,	0.002928
wrong	,	1,	0.001464
you	,	1,	0.001464
--------------------------------------

Select max frequent token from:  else for do to while if
=> selected is: to