#Multi-word Queries
#Triple Gold Star
#For this question, your goal is to modify the search engine to be able to
#handle multi-word queries. To do this, we need to make two main changes:
# 1. Modify the index to keep track of not only the URL, but the position
# within that page where a word appears.
# 2. Make a version of the lookup procedure that takes a list of target
# words, and only counts a URL as a match if it contains all of the target
# words, adjacent to each other, in the order they are given in the input.
#For example, if the search input is "Monty Python", it should match a page that
#contains, "Monty Python is funny!", but should not match a page containing
#"Monty likes the Python programming language." The words must appear in the
#same order, and the next word must start right after the end of the previous
#word.
#Modify the search engine code to support multi-word queries. Your modified code
#should define these two procedures:
# crawl_web(seed) => index, graph
# A modified version of crawl_web that produces an index that includes
# positional information. It is up to you to figure out how to represent
# positions in your index and you can do this any way you want. Whatever
# index you produce is the one we will pass into your multi_lookup(index,
# keyword) procedure.
# multi_lookup(index, list of keywords) => list of URLs
# A URL should be included in the output list, only if it contains all of
# the keywords in the input list, next to each other.
def multi_lookup(index, keywords):
result = []
if not keywords:
return result
current = lookup(index, keywords[0])
for keyword in keywords[1:]:
valid = []
next = lookup(index, keyword)
if next:
for url, pos in current:
if [url, pos + 1] in next:
valid.append([url, pos + 1])
current = valid
for url, pos in current:
result.append(url)
return result
def crawl_web(seed): # returns index, graph of inlinks
tocrawl = [seed]
crawled = []
graph = {} # <url>, [list of pages it links to]
index = {}
while tocrawl:
page = tocrawl.pop()
if page not in crawled:
content = get_page(page)
add_page_to_index(index, page, content)
outlinks = get_all_links(content)
graph[page] = outlinks
union(tocrawl, outlinks)
crawled.append(page)
return index, graph
def get_next_target(page):
start_link = page.find('<a href=')
if start_link == -1:
return None, 0
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote + 1)
url = page[start_quote + 1:end_quote]
return url, end_quote
def get_all_links(page):
links = []
while True:
url, endpos = get_next_target(page)
if url:
links.append(url)
page = page[endpos:]
else:
break
return links
def union(a, b):
for e in b:
if e not in a:
a.append(e)
def add_page_to_index(index, url, content):
words = content.split()
for position, word in enumerate(words):
add_to_index(index, word, url, position)
def add_to_index(index, keyword, url, position):
if keyword in index:
index[keyword].append([url, position])
else:
index[keyword] = [[url, position]]
def lookup(index, keyword):
if keyword in index:
return index[keyword]
else:
return None
cache = {
'http://w...content-available-to-author-only...y.com/cs101x/final/multi.html': """<html>
<body>
<a href="http://w...content-available-to-author-only...y.com/cs101x/final/a.html">A</a><br>
<a href="http://w...content-available-to-author-only...y.com/cs101x/final/b.html">B</a><br>
</body>
""",
'http://w...content-available-to-author-only...y.com/cs101x/final/b.html': """<html>
<body>
Monty likes the Python programming language
Thomas Jefferson founded the University of Virginia
When Mandela was in London, he visited Nelson's Column.
</body>
</html>
""",
'http://w...content-available-to-author-only...y.com/cs101x/final/a.html': """<html>
<body>
Monty Python is not about a programming language
Udacity was not founded by Thomas Jefferson
Nelson Mandela said "Education is the most powerful weapon which you can
use to change the world."
</body>
</html>
""",
}
def get_page(url):
if url in cache:
return cache[url]
else:
print "Page not in cache: " + url
return None
#Here are a few examples from the test site:
index, graph = crawl_web('http://w...content-available-to-author-only...y.com/cs101x/final/multi.html')
print multi_lookup(index, ['Python'])
#>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html', 'http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
print multi_lookup(index, ['Monty', 'Python'])
#>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
print multi_lookup(index, ['Python', 'programming', 'language'])
#>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html']
print multi_lookup(index, ['Thomas', 'Jefferson'])
#>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/b.html', 'http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
print multi_lookup(index, ['most', 'powerful', 'weapon'])
#>>> ['http://w...content-available-to-author-only...y.com/cs101x/final/a.html']
I011bHRpLXdvcmQgUXVlcmllcwoKI1RyaXBsZSBHb2xkIFN0YXIKCiNGb3IgdGhpcyBxdWVzdGlvbiwgeW91ciBnb2FsIGlzIHRvIG1vZGlmeSB0aGUgc2VhcmNoIGVuZ2luZSB0byBiZSBhYmxlIHRvCiNoYW5kbGUgbXVsdGktd29yZCBxdWVyaWVzLiAgVG8gZG8gdGhpcywgd2UgbmVlZCB0byBtYWtlIHR3byBtYWluIGNoYW5nZXM6CgojICAgIDEuIE1vZGlmeSB0aGUgaW5kZXggdG8ga2VlcCB0cmFjayBvZiBub3Qgb25seSB0aGUgVVJMLCBidXQgdGhlIHBvc2l0aW9uCiMgICAgd2l0aGluIHRoYXQgcGFnZSB3aGVyZSBhIHdvcmQgYXBwZWFycy4KCiMgICAgMi4gTWFrZSBhIHZlcnNpb24gb2YgdGhlIGxvb2t1cCBwcm9jZWR1cmUgdGhhdCB0YWtlcyBhIGxpc3Qgb2YgdGFyZ2V0CiMgICAgd29yZHMsIGFuZCBvbmx5IGNvdW50cyBhIFVSTCBhcyBhIG1hdGNoIGlmIGl0IGNvbnRhaW5zIGFsbCBvZiB0aGUgdGFyZ2V0CiMgICAgd29yZHMsIGFkamFjZW50IHRvIGVhY2ggb3RoZXIsIGluIHRoZSBvcmRlciB0aGV5IGFyZSBnaXZlbiBpbiB0aGUgaW5wdXQuCgojRm9yIGV4YW1wbGUsIGlmIHRoZSBzZWFyY2ggaW5wdXQgaXMgIk1vbnR5IFB5dGhvbiIsIGl0IHNob3VsZCBtYXRjaCBhIHBhZ2UgdGhhdAojY29udGFpbnMsICJNb250eSBQeXRob24gaXMgZnVubnkhIiwgYnV0IHNob3VsZCBub3QgbWF0Y2ggYSBwYWdlIGNvbnRhaW5pbmcKIyJNb250eSBsaWtlcyB0aGUgUHl0aG9uIHByb2dyYW1taW5nIGxhbmd1YWdlLiIgIFRoZSB3b3JkcyBtdXN0IGFwcGVhciBpbiB0aGUKI3NhbWUgb3JkZXIsIGFuZCB0aGUgbmV4dCB3b3JkIG11c3Qgc3RhcnQgcmlnaHQgYWZ0ZXIgdGhlIGVuZCBvZiB0aGUgcHJldmlvdXMKI3dvcmQuCgojTW9kaWZ5IHRoZSBzZWFyY2ggZW5naW5lIGNvZGUgdG8gc3VwcG9ydCBtdWx0aS13b3JkIHF1ZXJpZXMuIFlvdXIgbW9kaWZpZWQgY29kZQojc2hvdWxkIGRlZmluZSB0aGVzZSB0d28gcHJvY2VkdXJlczoKCiMgICAgY3Jhd2xfd2ViKHNlZWQpID0+IGluZGV4LCBncmFwaAojICAgICAgICBBIG1vZGlmaWVkIHZlcnNpb24gb2YgY3Jhd2xfd2ViIHRoYXQgcHJvZHVjZXMgYW4gaW5kZXggdGhhdCBpbmNsdWRlcwojICAgICAgICBwb3NpdGlvbmFsIGluZm9ybWF0aW9uLiAgSXQgaXMgdXAgdG8geW91IHRvIGZpZ3VyZSBvdXQgaG93IHRvIHJlcHJlc2VudAojICAgICAgICBwb3NpdGlvbnMgaW4geW91ciBpbmRleCBhbmQgeW91IGNhbiBkbyB0aGlzIGFueSB3YXkgeW91IHdhbnQuICBXaGF0ZXZlcgojICAgICAgICBpbmRleCB5b3UgcHJvZHVjZSBpcyB0aGUgb25lIHdlIHdpbGwgcGFzcyBpbnRvIHlvdXIgbXVsdGlfbG9va3VwKGluZGV4LAojICAgICAgICBrZXl3b3JkKSBwcm9jZWR1cmUuCgojICAgIG11bHRpX2xvb2t1cChpbmRleCwgbGlzdCBvZiBrZXl3b3JkcykgPT4gbGlzdCBvZiBVUkxzCiMgICAgICAgIEEgVVJMIHNob3VsZCBiZSBpbmNsdWRlZCBpbiB0aGUgb3V0cHV0IGxpc3QsIG9ubHkgaWYgaXQgY29udGFpbnMgYWxsIG9mCiMgICAgICAgIHRoZSBrZXl3b3JkcyBpbiB0aGUgaW5wdXQgbGlzdCwgbmV4dCB0byBlYWNoIG90aGVyLgoKCmRlZiBtdWx0aV9sb29rdXAoaW5kZXgsIGtleXdvcmRzKToKICAgIHJlc3VsdCA9IFtdCiAgICBpZiBub3Qga2V5d29yZHM6CiAgICAgICAgcmV0dXJuIHJlc3VsdAogICAgY3VycmVudCA9IGxvb2t1cChpbmRleCwga2V5d29yZHNbMF0pCiAgICBmb3Iga2V5d29yZCBpbiBrZXl3b3Jkc1sxOl06CiAgICAgICAgdmFsaWQgPSBbXQogICAgICAgIG5leHQgPSBsb29rdXAoaW5kZXgsIGtleXdvcmQpCiAgICAgICAgaWYgbmV4dDoKICAgICAgICAgICAgZm9yIHVybCwgcG9zIGluIGN1cnJlbnQ6CiAgICAgICAgICAgICAgICBpZiBbdXJsLCBwb3MgKyAxXSBpbiBuZXh0OgogICAgICAgICAgICAgICAgICAgIHZhbGlkLmFwcGVuZChbdXJsLCBwb3MgKyAxXSkKICAgICAgICBjdXJyZW50ID0gdmFsaWQKICAgIGZvciB1cmwsIHBvcyBpbiBjdXJyZW50OgogICAgICAgICByZXN1bHQuYXBwZW5kKHVybCkKICAgIHJldHVybiByZXN1bHQKCmRlZiBjcmF3bF93ZWIoc2VlZCk6ICMgcmV0dXJucyBpbmRleCwgZ3JhcGggb2YgaW5saW5rcwogICAgdG9jcmF3bCA9IFtzZWVkXQogICAgY3Jhd2xlZCA9IFtdCiAgICBncmFwaCA9IHt9ICAjIDx1cmw+LCBbbGlzdCBvZiBwYWdlcyBpdCBsaW5rcyB0b10KICAgIGluZGV4ID0ge30gCiAgICB3aGlsZSB0b2NyYXdsOiAKICAgICAgICBwYWdlID0gdG9jcmF3bC5wb3AoKQogICAgICAgIGlmIHBhZ2Ugbm90IGluIGNyYXdsZWQ6CiAgICAgICAgICAgIGNvbnRlbnQgPSBnZXRfcGFnZShwYWdlKQogICAgICAgICAgICBhZGRfcGFnZV90b19pbmRleChpbmRleCwgcGFnZSwgY29udGVudCkKICAgICAgICAgICAgb3V0bGlua3MgPSBnZXRfYWxsX2xpbmtzKGNvbnRlbnQpCiAgICAgICAgICAgIGdyYXBoW3BhZ2VdID0gb3V0bGlua3MKICAgICAgICAgICAgdW5pb24odG9jcmF3bCwgb3V0bGlua3MpCiAgICAgICAgICAgIGNyYXdsZWQuYXBwZW5kKHBhZ2UpCiAgICByZXR1cm4gaW5kZXgsIGdyYXBoCgoKZGVmIGdldF9uZXh0X3RhcmdldChwYWdlKToKICAgIHN0YXJ0X2xpbmsgPSBwYWdlLmZpbmQoJzxhIGhyZWY9JykKICAgIGlmIHN0YXJ0X2xpbmsgPT0gLTE6IAogICAgICAgIHJldHVybiBOb25lLCAwCiAgICBzdGFydF9xdW90ZSA9IHBhZ2UuZmluZCgnIicsIHN0YXJ0X2xpbmspCiAgICBlbmRfcXVvdGUgPSBwYWdlLmZpbmQoJyInLCBzdGFydF9xdW90ZSArIDEpCiAgICB1cmwgPSBwYWdlW3N0YXJ0X3F1b3RlICsgMTplbmRfcXVvdGVdCiAgICByZXR1cm4gdXJsLCBlbmRfcXVvdGUKCmRlZiBnZXRfYWxsX2xpbmtzKHBhZ2UpOgogICAgbGlua3MgPSBbXQogICAgd2hpbGUgVHJ1ZToKICAgICAgICB1cmwsIGVuZHBvcyA9IGdldF9uZXh0X3RhcmdldChwYWdlKQogICAgICAgIGlmIHVybDoKICAgICAgICAgICAgbGlua3MuYXBwZW5kKHVybCkKICAgICAgICAgICAgcGFnZSA9IHBhZ2VbZW5kcG9zOl0KICAgICAgICBlbHNlOgogICAgICAgICAgICBicmVhawogICAgcmV0dXJuIGxpbmtzCgoKZGVmIHVuaW9uKGEsIGIpOgogICAgZm9yIGUgaW4gYjoKICAgICAgICBpZiBlIG5vdCBpbiBhOgogICAgICAgICAgICBhLmFwcGVuZChlKQoKZGVmIGFkZF9wYWdlX3RvX2luZGV4KGluZGV4LCB1cmwsIGNvbnRlbnQpOgogICAgd29yZHMgPSBjb250ZW50LnNwbGl0KCkKICAgIGZvciBwb3NpdGlvbiwgd29yZCBpbiBlbnVtZXJhdGUod29yZHMpOgogICAgICAgIGFkZF90b19pbmRleChpbmRleCwgd29yZCwgdXJsLCBwb3NpdGlvbikKICAgICAgICAKZGVmIGFkZF90b19pbmRleChpbmRleCwga2V5d29yZCwgdXJsLCBwb3NpdGlvbik6CiAgICBpZiBrZXl3b3JkIGluIGluZGV4OgogICAgICAgIGluZGV4W2tleXdvcmRdLmFwcGVuZChbdXJsLCBwb3NpdGlvbl0pCiAgICBlbHNlOgogICAgICAgIGluZGV4W2tleXdvcmRdID0gW1t1cmwsIHBvc2l0aW9uXV0KCmRlZiBsb29rdXAoaW5kZXgsIGtleXdvcmQpOgogICAgaWYga2V5d29yZCBpbiBpbmRleDoKICAgICAgICByZXR1cm4gaW5kZXhba2V5d29yZF0KICAgIGVsc2U6CiAgICAgICAgcmV0dXJuIE5vbmUKICAgIAoKCgpjYWNoZSA9IHsKICAgJ2h0dHA6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi55LmNvbS9jczEwMXgvZmluYWwvbXVsdGkuaHRtbCc6ICIiIjxodG1sPgo8Ym9keT4KCjxhIGhyZWY9Imh0dHA6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi55LmNvbS9jczEwMXgvZmluYWwvYS5odG1sIj5BPC9hPjxicj4KPGEgaHJlZj0iaHR0cDovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLnkuY29tL2NzMTAxeC9maW5hbC9iLmh0bWwiPkI8L2E+PGJyPgoKPC9ib2R5PgoiIiIsIAogICAnaHR0cDovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLnkuY29tL2NzMTAxeC9maW5hbC9iLmh0bWwnOiAiIiI8aHRtbD4KPGJvZHk+CgpNb250eSBsaWtlcyB0aGUgUHl0aG9uIHByb2dyYW1taW5nIGxhbmd1YWdlClRob21hcyBKZWZmZXJzb24gZm91bmRlZCB0aGUgVW5pdmVyc2l0eSBvZiBWaXJnaW5pYQpXaGVuIE1hbmRlbGEgd2FzIGluIExvbmRvbiwgaGUgdmlzaXRlZCBOZWxzb24ncyBDb2x1bW4uCgo8L2JvZHk+CjwvaHRtbD4KIiIiLCAKICAgJ2h0dHA6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi55LmNvbS9jczEwMXgvZmluYWwvYS5odG1sJzogIiIiPGh0bWw+Cjxib2R5PgoKTW9udHkgUHl0aG9uIGlzIG5vdCBhYm91dCBhIHByb2dyYW1taW5nIGxhbmd1YWdlClVkYWNpdHkgd2FzIG5vdCBmb3VuZGVkIGJ5IFRob21hcyBKZWZmZXJzb24KTmVsc29uIE1hbmRlbGEgc2FpZCAiRWR1Y2F0aW9uIGlzIHRoZSBtb3N0IHBvd2VyZnVsIHdlYXBvbiB3aGljaCB5b3UgY2FuCnVzZSB0byBjaGFuZ2UgdGhlIHdvcmxkLiIKPC9ib2R5Pgo8L2h0bWw+CiIiIiwgCn0KCmRlZiBnZXRfcGFnZSh1cmwpOgogICAgaWYgdXJsIGluIGNhY2hlOgogICAgICAgIHJldHVybiBjYWNoZVt1cmxdCiAgICBlbHNlOgogICAgICAgIHByaW50ICJQYWdlIG5vdCBpbiBjYWNoZTogIiArIHVybAogICAgICAgIHJldHVybiBOb25lCiAgICAKCgoKCgojSGVyZSBhcmUgYSBmZXcgZXhhbXBsZXMgZnJvbSB0aGUgdGVzdCBzaXRlOgoKaW5kZXgsIGdyYXBoID0gY3Jhd2xfd2ViKCdodHRwOi8vdy4uLmNvbnRlbnQtYXZhaWxhYmxlLXRvLWF1dGhvci1vbmx5Li4ueS5jb20vY3MxMDF4L2ZpbmFsL211bHRpLmh0bWwnKQoKcHJpbnQgbXVsdGlfbG9va3VwKGluZGV4LCBbJ1B5dGhvbiddKQojPj4+IFsnaHR0cDovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLnkuY29tL2NzMTAxeC9maW5hbC9iLmh0bWwnLCAnaHR0cDovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLnkuY29tL2NzMTAxeC9maW5hbC9hLmh0bWwnXQoKcHJpbnQgbXVsdGlfbG9va3VwKGluZGV4LCBbJ01vbnR5JywgJ1B5dGhvbiddKQojPj4+IFsnaHR0cDovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLnkuY29tL2NzMTAxeC9maW5hbC9hLmh0bWwnXQoKcHJpbnQgbXVsdGlfbG9va3VwKGluZGV4LCBbJ1B5dGhvbicsICdwcm9ncmFtbWluZycsICdsYW5ndWFnZSddKQojPj4+IFsnaHR0cDovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLnkuY29tL2NzMTAxeC9maW5hbC9iLmh0bWwnXQoKcHJpbnQgbXVsdGlfbG9va3VwKGluZGV4LCBbJ1Rob21hcycsICdKZWZmZXJzb24nXSkKIz4+PiBbJ2h0dHA6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi55LmNvbS9jczEwMXgvZmluYWwvYi5odG1sJywgJ2h0dHA6Ly93Li4uY29udGVudC1hdmFpbGFibGUtdG8tYXV0aG9yLW9ubHkuLi55LmNvbS9jczEwMXgvZmluYWwvYS5odG1sJ10KCnByaW50IG11bHRpX2xvb2t1cChpbmRleCwgWydtb3N0JywgJ3Bvd2VyZnVsJywgJ3dlYXBvbiddKQojPj4+IFsnaHR0cDovL3cuLi5jb250ZW50LWF2YWlsYWJsZS10by1hdXRob3Itb25seS4uLnkuY29tL2NzMTAxeC9maW5hbC9hLmh0bWwnXQ==