# fork download (ideone paste header)
#Multi-word Queries
#
#Triple Gold Star
#
#For this question, your goal is to modify the search engine to be able to
#handle multi-word queries. To do this, we need to make two main changes:
#
# 1. Modify the index to keep track of not only the URL, but the position
# within that page where a word appears.
#
# 2. Make a version of the lookup procedure that takes a list of target
# words, and only counts a URL as a match if it contains all of the target
# words, adjacent to each other, in the order they are given in the input.
#
#For example, if the search input is "Monty Python", it should match a page that
#contains "Monty Python is funny!", but should not match a page containing
#"Monty likes the Python programming language." The words must appear in the
#same order, and the next word must start right after the end of the previous
#word.
#
#Modify the search engine code to support multi-word queries. Your modified code
#should define these two procedures:
#
# crawl_web(seed) => index, graph
# A modified version of crawl_web that produces an index that includes
# positional information. It is up to you to figure out how to represent
# positions in your index and you can do this any way you want. Whatever
# index you produce is the one we will pass into your multi_lookup(index,
# keyword) procedure.
#
# multi_lookup(index, list of keywords) => list of URLs
# A URL should be included in the output list, only if it contains all of
# the keywords in the input list, next to each other.

  36. def multi_lookup(index, keywords):
  37. result = []
  38. if not keywords:
  39. return result
  40. current = lookup(index, keywords[0])
  41. for keyword in keywords[1:]:
  42. valid = []
  43. next = lookup(index, keyword)
  44. if next:
  45. for url, pos in current:
  46. if [url, pos + 1] in next:
  47. valid.append([url, pos + 1])
  48. current = valid
  49. for url, pos in current:
  50. result.append(url)
  51. return result
  52.  
  53. def crawl_web(seed): # returns index, graph of inlinks
  54. tocrawl = [seed]
  55. crawled = []
  56. graph = {} # <url>, [list of pages it links to]
  57. index = {}
  58. while tocrawl:
  59. page = tocrawl.pop()
  60. if page not in crawled:
  61. content = get_page(page)
  62. add_page_to_index(index, page, content)
  63. outlinks = get_all_links(content)
  64. graph[page] = outlinks
  65. union(tocrawl, outlinks)
  66. crawled.append(page)
  67. return index, graph
  68.  
  69.  
  70. def get_next_target(page):
  71. start_link = page.find('<a href=')
  72. if start_link == -1:
  73. return None, 0
  74. start_quote = page.find('"', start_link)
  75. end_quote = page.find('"', start_quote + 1)
  76. url = page[start_quote + 1:end_quote]
  77. return url, end_quote
  78.  
  79. def get_all_links(page):
  80. links = []
  81. while True:
  82. url, endpos = get_next_target(page)
  83. if url:
  84. links.append(url)
  85. page = page[endpos:]
  86. else:
  87. break
  88. return links
  89.  
  90.  
  91. def union(a, b):
  92. for e in b:
  93. if e not in a:
  94. a.append(e)
  95.  
  96. def add_page_to_index(index, url, content):
  97. words = content.split()
  98. for position, word in enumerate(words):
  99. add_to_index(index, word, url, position)
  100.  
  101. def add_to_index(index, keyword, url, position):
  102. if keyword in index:
  103. index[keyword].append([url, position])
  104. else:
  105. index[keyword] = [[url, position]]
  106.  
  107. def lookup(index, keyword):
  108. if keyword in index:
  109. return index[keyword]
  110. else:
  111. return None
  112.  
  113.  
  114.  
  115.  
# Tiny offline "web" for testing: maps URL -> page HTML so the crawler
# can run without network access.  multi.html links to a.html and b.html.
cache = {
'http://w...content-available-to-author-only...y.com/cs101x/final/multi.html': """<html>
<body>

<a href="http://w...content-available-to-author-only...y.com/cs101x/final/a.html">A</a><br>
<a href="http://w...content-available-to-author-only...y.com/cs101x/final/b.html">B</a><br>

</body>
""",
'http://w...content-available-to-author-only...y.com/cs101x/final/b.html': """<html>
<body>

Monty likes the Python programming language
Thomas Jefferson founded the University of Virginia
When Mandela was in London, he visited Nelson's Column.

</body>
</html>
""",
'http://w...content-available-to-author-only...y.com/cs101x/final/a.html': """<html>
<body>

Monty Python is not about a programming language
Udacity was not founded by Thomas Jefferson
Nelson Mandela said "Education is the most powerful weapon which you can
use to change the world."
</body>
</html>
""",
}
  147. def get_page(url):
  148. if url in cache:
  149. return cache[url]
  150. else:
  151. print "Page not in cache: " + url
  152. return None
  153.  
  154.  
  155.  
  156.  
  157.  
  158.  
  159. #Here are a few examples from the test site:
  160.  
  161. index, graph = crawl_web('http://w...content-available-to-author-only...y.com/cs101x/final/multi.html')
  162.  
  163. print multi_lookup(index, ['Python'])
  164. #>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html', 'http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
  165.  
  166. print multi_lookup(index, ['Monty', 'Python'])
  167. #>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
  168.  
  169. print multi_lookup(index, ['Python', 'programming', 'language'])
  170. #>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html']
  171.  
  172. print multi_lookup(index, ['Thomas', 'Jefferson'])
  173. #>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html', 'http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
  174.  
  175. print multi_lookup(index, ['most', 'powerful', 'weapon'])
  176. #>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
# ideone run result: Success, 0.02s, 4676KB (stdin empty)
# stdout:
# ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html', 'http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
# ['http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
# ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html']
# ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html', 'http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
# ['http://w...content-available-to-author-only...y.com/cs101x/final/a.html']