fork(4) download
  1. #!/usr/bin/python3.9
  2. # -*- coding: UTF-8 -*-
  3. import unittest
  4. import re
  5.  
  6.  
  7. def count_occurences_in_text(word, text):
  8. pattern = r"(?<![a-z])((?<!')|(?<=''))"+str(word.lower())+"(?![a-z])((?!')|(?=''))"
  9. line_now = text.lower()
  10. count = 0
  11. search = re.search(pattern, line_now)
  12. while search:
  13. count +=1
  14. line_now = line_now[search.span()[1]:]
  15. search = re.search(pattern,line_now)
  16. return count
  17. # return count_occurrences(word, text)
  18.  
  19.  
  20. def count_occurrences(word, text):
  21. word = word.lower()
  22. text = text.lower()
  23. pattern = r"(?<![a-z])((?<!')|(?<=''))"+str(word)+"(?![a-z])((?!')|(?=''))"
  24. return len(re.findall(pattern, text))
  25.  
  26.  
  27. class TestCountOccurencesInText(unittest.TestCase):
  28. def test_count_occurences_in_text(self):
  29. """
  30. Test the count_occurences_in_text function
  31. """
  32. text = """Georges is my name and I like python. Oh ! your name is georges? And you like Python!
  33. Yes is is true, I like PYTHON
  34. and my name is GEORGES"""
  35. # test with a little text.
  36. self.assertEqual(3, count_occurences_in_text("Georges", text))
  37. self.assertEqual(3, count_occurences_in_text("GEORGES", text))
  38. self.assertEqual(3, count_occurences_in_text("georges", text))
  39. self.assertEqual(0, count_occurences_in_text("george", text))
  40. self.assertEqual(3, count_occurences_in_text("python", text))
  41. self.assertEqual(3, count_occurences_in_text("PYTHON", text))
  42. self.assertEqual(2, count_occurences_in_text("I", text))
  43. self.assertEqual(0, count_occurences_in_text("n", text))
  44. self.assertEqual(1, count_occurences_in_text("true", text))
  45. # regard ' as text:
  46. self.assertEqual(0, count_occurences_in_text("maley", "John O'maley is my friend"))
  47.  
  48. # Test it but with a BIG length file. (we once had a memory error with this...)
  49. text = """The quick brown fox jump over the lazy dog.The quick brown fox jump over the lazy dog.""" * 500
  50. text += """The quick brown fox jump over the lazy dog.The quick brown Georges jump over the lazy dog."""
  51. text += """esrf sqfdg sfdglkj sdflgh sdflgjdsqrgl """ * 4000
  52. text += """The quick brown fox jump over the lazy dog.The quick brown fox jump over the lazy python."""
  53. text += """The quick brown fox jump over the lazy dog.The quick brown fox jump over the lazy dog.""" * 500
  54. text += """The quick brown fox jump over the lazy dog.The quick brown Georges jump over the lazy dog."""
  55. text += """esrf sqfdg sfdglkj sdflgh sdflgjdsqrgl """ * 4000
  56. text += """The quick brown fox jump over the lazy dog.The quick brown fox jump over the lazy python."""
  57. text += """The quick brown fox jump over the lazy dog.The quick brown fox jump over the lazy dog.""" * 500
  58. text += """The quick brown fox jump over the lazy dog.The quick brown Georges jump over the lazy dog."""
  59. text += """esrf sqfdg sfdglkj sdflgh sdflgjdsqrgl """ * 4000
  60. text += """The quick brown fox jump over the lazy dog.The quick brown fox jump over the lazy python."""
  61. text += """The quick brown fox jump over the true lazy dog.The quick brown fox jump over the lazy dog."""
  62. text += """The quick brown fox jump over the lazy dog.The quick brown fox jump over the lazy dog.""" * 500
  63. text += """ I vsfgsdfg sfdg sdfg sdgh sgh I sfdgsdf"""
  64. text += """The quick brown fox jump over the lazy dog.The quick brown fox jump over the lazy dog.""" * 500
  65.  
  66. self.assertEqual(3, count_occurences_in_text("Georges", text))
  67. self.assertEqual(3, count_occurences_in_text("GEORGES", text))
  68. self.assertEqual(3, count_occurences_in_text("georges", text))
  69. self.assertEqual(0, count_occurences_in_text("george", text))
  70. self.assertEqual(3, count_occurences_in_text("python", text))
  71. self.assertEqual(3, count_occurences_in_text("PYTHON", text))
  72. self.assertEqual(2, count_occurences_in_text("I", text))
  73. self.assertEqual(0, count_occurences_in_text("n", text))
  74. self.assertEqual(1, count_occurences_in_text("true", text))
  75. self.assertEqual(0, count_occurences_in_text("reflexion mirror", "I am a senior citizen and I live in the Fun-Plex 'Reflexion Mirror' in Sopchoppy, Florida"))
  76. self.assertEqual(1, count_occurences_in_text("'reflexion mirror'", "I am a senior citizen and I live in the Fun-Plex 'Reflexion Mirror' in Sopchoppy, Florida"))
  77. self.assertEqual(1, count_occurences_in_text("reflexion mirror", "I am a senior citizen and I live in the Fun-Plex (Reflexion Mirror) in Sopchoppy, Florida"))
  78. self.assertEqual(1, count_occurences_in_text("reflexion mirror", "Reflexion Mirror\" in Sopchoppy, Florida"))
  79. self.assertEqual(1, count_occurences_in_text("reflexion mirror", "I am a senior citizen and I live in the Fun-Plex «Reflexion Mirror» in Sopchoppy, Florida"))
  80. self.assertEqual(1, count_occurences_in_text("reflexion mirror", "I am a senior citizen and I live in the Fun-Plex \u201cReflexion Mirror\u201d in Sopchoppy, Florida"))
  81. self.assertEqual(1, count_occurences_in_text("legitimate",
  82. "who is approved by OILS is completely legitimate: their employees are of legal working age"))
  83. self.assertEqual(0, count_occurences_in_text("legitimate their",
  84. "who is approved by OILS is completely legitimate: their employees are of legal working age"))
  85. self.assertEqual(1, count_occurences_in_text("get back to me",
  86. "I hope you will consider this proposal, and get back to me as soon as possible"))
  87. self.assertEqual(1, count_occurences_in_text("skin-care",
  88. "enable Delavigne and its subsidiaries to create a skin-care monopoly"))
  89. self.assertEqual(1, count_occurences_in_text("skin-care monopoly",
  90. "enable Delavigne and its subsidiaries to create a skin-care monopoly"))
  91. self.assertEqual(0, count_occurences_in_text("skin-care monopoly in the US",
  92. "enable Delavigne and its subsidiaries to create a skin-care monopoly"))
  93. self.assertEqual(1, count_occurences_in_text("get back to me",
  94. "When you know:get back to me"))
  95. self.assertEqual(1, count_occurences_in_text("don't be left", """emergency alarm warning.
  96. Don't be left unprotected. Order your SSSS3000 today!"""))
  97. self.assertEqual(1, count_occurences_in_text("don", """emergency alarm warning.
  98. Don't be left unprotected. Order your don SSSS3000 today!"""))
  99. self.assertEqual(1, count_occurences_in_text("take that as a 'yes'",
  100. "Do I have to take that as a 'yes'?"))
  101. self.assertEqual(1, count_occurences_in_text("don't take that as a 'yes'",
  102. "I don't take that as a 'yes'?"))
  103. self.assertEqual(1, count_occurences_in_text("take that as a 'yes'","I don't take that as a 'yes'?"))
  104. self.assertEqual(1, count_occurences_in_text("don't", "I don't take that as a 'yes'?"))
  105. self.assertEqual(1, count_occurences_in_text("attaching my c.v. to this e-mail", "I am attaching my c.v. to this e-mail."))
  106. self.assertEqual(1, count_occurences_in_text("Linguist",
  107. "'''Linguist Specialist Found Dead on Laboratory Floor'''"))
  108. self.assertEqual(1, count_occurences_in_text("Linguist Specialist", "'''Linguist Specialist Found Dead on Laboratory Floor'''"))
  109. self.assertEqual(1, count_occurences_in_text("Laboratory Floor","'''Linguist Specialist Found Dead on Laboratory Floor'''"))
  110. self.assertEqual(1, count_occurences_in_text("Floor", "'''Linguist Specialist Found Dead on Laboratory Floor'''"))
  111. self.assertEqual(1, count_occurences_in_text("Floor", "''Linguist Specialist Found Dead on Laboratory Floor''"))
  112. self.assertEqual(1, count_occurences_in_text("Floor", "__Linguist Specialist Found Dead on Laboratory Floor__"))
  113. self.assertEqual(1, count_occurences_in_text("Floor","'''''Linguist Specialist Found Dead on Laboratory Floor'''''"))
  114. self.assertEqual(1, count_occurences_in_text("Linguist","'''Linguist Specialist Found Dead on Laboratory Floor'''"))
  115. self.assertEqual(1, count_occurences_in_text("Linguist", "''Linguist Specialist Found Dead on Laboratory Floor''"))
  116. self.assertEqual(1, count_occurences_in_text("Linguist", "__Linguist Specialist Found Dead on Laboratory Floor__"))
  117. self.assertEqual(1, count_occurences_in_text("Linguist","'''''Linguist Specialist Found Dead on Laboratory Floor'''''"))
  118. self.assertEqual(1, count_occurences_in_text("Floor","""Look: ''Linguist Specialist Found Dead on Laboratory Floor'' is the headline today."""))
  119.  
  120.  
# Text searched by the doit() benchmark. The hard line wraps, including the
# ones that split a word in two ("su" / "gar", "suga" / "r", "ne" / "arly"),
# are part of the data and affect the counts, so keep them as they are.
SAMPLE_TEXT_FOR_BENCH = """
A Suggestion Box Entry from Bob Carter

Dear Anonymous,

I'm not quite sure I understand the concept of this 'Anonymous' Suggestion Box. If no one reads what we write, then how will anything ever
change?

But in the spirit of good will, I've decided to offer my two cents, and hopefully Kevin won't steal it! (ha, ha). I would really like to
see more varieties of coffee in the coffee machine in the break room. 'Milk and sugar', 'black with sugar', 'extra sugar' and 'cream and su
gar' don't offer much diversity. Also, the selection of drinks seems heavily weighted in favor of 'sugar'. What if we don't want any suga
r?

But all this is beside the point because I quite like sugar, to be honest. In fact, that's my second suggestion: more sugar in the office.
Cakes, candy, insulin, aspartame... I'm not picky. I'll take it by mouth or inject it intravenously, if I have to.

Also, if someone could please fix the lock on the men's room stall, that would be helpful. Yesterday I was doing my business when Icarus ne
arly climbed into my lap.

So, have a great day!

Anonymously,
Bob Carter
"""
  145.  
  146.  
  147. def doit():
  148. """
  149. Run count_occurences_in_text on a few examples
  150. """
  151. i = 0
  152. for x in range(400):
  153. i += count_occurences_in_text("word", SAMPLE_TEXT_FOR_BENCH)
  154. i += count_occurences_in_text("sugar", SAMPLE_TEXT_FOR_BENCH)
  155. i += count_occurences_in_text("help", SAMPLE_TEXT_FOR_BENCH)
  156. i += count_occurences_in_text("heavily", SAMPLE_TEXT_FOR_BENCH)
  157. i += count_occurences_in_text("witfull", SAMPLE_TEXT_FOR_BENCH)
  158. i += count_occurences_in_text("dog", SAMPLE_TEXT_FOR_BENCH)
  159. i += count_occurences_in_text("almost", SAMPLE_TEXT_FOR_BENCH)
  160. i += count_occurences_in_text("insulin", SAMPLE_TEXT_FOR_BENCH)
  161. i += count_occurences_in_text("attaching", SAMPLE_TEXT_FOR_BENCH)
  162. i += count_occurences_in_text("asma", SAMPLE_TEXT_FOR_BENCH)
  163. i += count_occurences_in_text("neither", SAMPLE_TEXT_FOR_BENCH)
  164. i += count_occurences_in_text("won't", SAMPLE_TEXT_FOR_BENCH)
  165. i += count_occurences_in_text("green", SAMPLE_TEXT_FOR_BENCH)
  166. i += count_occurences_in_text("parabole", SAMPLE_TEXT_FOR_BENCH)
  167. print(i)
  168.  
  169.  
# Script entry point: profile the benchmark first, then run the unit tests.
if __name__ == '__main__':
    # Speed matters as well as correctness, so profile the doit() benchmark.
    # NOTE(review): the standard-library docs recommend cProfile over profile
    # for lower overhead -- consider switching if these timings are compared.
    import profile

    profile.run('doit()')

    # Correctness: hand control to unittest to run the test case(s).
    unittest.main()
Runtime error #stdin #stdout #stderr 0.94s 11848KB
stdin
Standard input is empty
stdout
2000
         51252 function calls (50720 primitive calls) in 0.400 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1909    0.004    0.000    0.004    0.000 :0(append)
       14    0.000    0.000    0.000    0.000 :0(compile)
       84    0.000    0.000    0.000    0.000 :0(find)
      490    0.000    0.000    0.000    0.000 :0(isinstance)
       14    0.000    0.000    0.000    0.000 :0(items)
2444/2416    0.008    0.000    0.008    0.000 :0(len)
    11200    0.024    0.000    0.024    0.000 :0(lower)
       56    0.000    0.000    0.000    0.000 :0(max)
      336    0.000    0.000    0.000    0.000 :0(min)
      221    0.000    0.000    0.000    0.000 :0(ord)
       29    0.000    0.000    0.000    0.000 :0(range)
       28    0.000    0.000    0.000    0.000 :0(remove)
     7600    0.192    0.000    0.192    0.000 :0(search)
        1    0.000    0.000    0.000    0.000 :0(setprofile)
     2000    0.008    0.000    0.008    0.000 :0(span)
        1    0.000    0.000    0.400    0.400 <string>:1(<module>)
        1    0.000    0.000    0.400    0.400 profile:0(doit())
        0    0.000             0.000          profile:0(profiler)
        1    0.020    0.020    0.400    0.400 prog.py:147(doit)
     5600    0.060    0.000    0.380    0.000 prog.py:7(count_occurences_in_text)
     7600    0.040    0.000    0.288    0.000 re.py:143(search)
     7600    0.020    0.000    0.056    0.000 re.py:230(_compile)
       28    0.000    0.000    0.000    0.000 sre_compile.py:228(_compile_charset)
       28    0.000    0.000    0.000    0.000 sre_compile.py:256(_optimize_charset)
       14    0.000    0.000    0.000    0.000 sre_compile.py:433(_compile_info)
       28    0.000    0.000    0.000    0.000 sre_compile.py:546(isstring)
       14    0.000    0.000    0.012    0.001 sre_compile.py:552(_code)
       14    0.000    0.000    0.036    0.003 sre_compile.py:567(compile)
   182/14    0.008    0.000    0.012    0.001 sre_compile.py:64(_compile)
       84    0.000    0.000    0.000    0.000 sre_parse.py:138(__len__)
      448    0.000    0.000    0.000    0.000 sre_parse.py:142(__getitem__)
      333    0.000    0.000    0.000    0.000 sre_parse.py:150(append)
   140/56    0.000    0.000    0.000    0.000 sre_parse.py:152(getwidth)
       14    0.000    0.000    0.000    0.000 sre_parse.py:190(__init__)
      809    0.000    0.000    0.004    0.000 sre_parse.py:194(__next)
      798    0.000    0.000    0.000    0.000 sre_parse.py:207(match)
      543    0.000    0.000    0.004    0.000 sre_parse.py:213(get)
   126/14    0.008    0.000    0.024    0.002 sre_parse.py:336(_parse_sub)
   154/14    0.008    0.000    0.024    0.002 sre_parse.py:414(_parse)
       14    0.000    0.000    0.000    0.000 sre_parse.py:68(__init__)
       14    0.000    0.000    0.024    0.002 sre_parse.py:725(parse)
       28    0.000    0.000    0.004    0.000 sre_parse.py:75(opengroup)
       28    0.000    0.000    0.000    0.000 sre_parse.py:86(closegroup)
      182    0.000    0.000    0.000    0.000 sre_parse.py:93(__init__)


stderr
F
======================================================================
FAIL: test_count_occurences_in_text (__main__.TestCountOccurencesInText)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "prog.py", line 80, in test_count_occurences_in_text
AssertionError: 1 != 0

----------------------------------------------------------------------
Ran 1 test in 0.177s

FAILED (failures=1)