fork download
  1. import sys
  2. from optparse import OptionParser
  3. import urllib
  4. import urllib2
  5. import bs4
  6. from collections import defaultdict
  7. import pickle
  8.  
  9. MAIN_URI='http://s...content-available-to-author-only...o.com/ncaa/football/recruiting/recruit-search'
  10. SEARCH_URI='http://s...content-available-to-author-only...o.com/ncaa/football/recruiting/recruit-search-results'
  11.  
  12. def handleOptions():
  13. argParser = OptionParser(description='Compare common recruits from two schools based upon Rivals rankings.')
  14. argParser.add_option('-l', '--list', action='store_true', dest='list', default=False, help='List available schools')
  15. argParser.add_option('-1', '--first', action='store', dest='first', default='', help='First school to compare')
  16. argParser.add_option('-2', '--second', action='store', dest='second', default='', help='Second school to compare')
  17. argParser.add_option('-y', '--year', action='store', dest='year', default='2014', help='Year to compare')
  18. argParser.add_option('-s', '--stats', action='store_true', dest='stats', default=False, help='Print advanced class statistics')
  19. argParser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='Use verbose output')
  20.  
  21. argParser.add_option('', '--save', action='store_true', dest='save', default=False, help='Save scraped output to intermediate file')
  22. argParser.add_option('', '--load', action='store_true', dest='load', default=False, help='Load scraped output from intermediate file')
  23.  
  24. return argParser.parse_args()
  25.  
  26. class Recruit(object):
  27. def __init__(self, row):
  28. #self.row = row
  29. self.pos = str(row.contents[0].string)
  30. self.name = str(row.contents[1].a.string)
  31. self.location = str(row.contents[2].string)
  32. self.height = str(row.contents[3].string)
  33. self.weight = str(row.contents[4].string)
  34. self.forty = str(row.contents[5].string)
  35. stars = row.contents[6]
  36. if stars.span:
  37. self.stars = int(stars.span.string.split()[0])
  38. else:
  39. self.stars = 0
  40. try:
  41. self.rating = float(row.contents[7].string)
  42. except ValueError:
  43. self.rating = 0.0
  44. self.rank = str(row.contents[8].string)
  45. self.school = str(row.contents[9].div.a.string)
  46.  
  47. def getHtml(uri, values={}):
  48. """
  49. Retrieves HTML returned by the given URI.
  50. """
  51. data = urllib.urlencode(values, True)
  52. req = urllib2.Request(uri, data)
  53. req.add_header('User-agent', 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11')
  54.  
  55. html = None
  56. f = urllib2.urlopen(req)
  57. try:
  58. html = f.read()
  59. finally:
  60. f.close()
  61.  
  62. return html
  63.  
  64. def getSchoolList():
  65. """
  66. Returns a list of school names valid to search for.
  67. """
  68. html = getHtml(MAIN_URI)
  69.  
  70. soup = bs4.BeautifulSoup(html, 'html5lib')
  71.  
  72. select = soup.find(id='school-football')
  73. return [s['value'] for s in select.find_all('option')[1:]]
  74.  
  75. def getStartValues(soup):
  76. """
  77. Returns a list of start value tuples that define a range of recruit numbers suitable for acting as page navigation.
  78. """
  79. navDiv = soup.find(id='ysr-search-results-index-top')
  80. pageList = navDiv.ul
  81. startValues = set()
  82. for li in pageList.find_all('li'):
  83. if 'class' in li.attrs and 'summary' in li['class']:
  84. continue
  85.  
  86. if 'disabled' in li.form.button.attrs:
  87. continue
  88.  
  89. buttonStart = li.form.button['value']
  90. hiddenStart = None
  91. for inputTag in li.form.find_all('input'):
  92. if inputTag['name'] == 'start':
  93. hiddenStart = inputTag['value']
  94. startValues.add((buttonStart, hiddenStart))
  95.  
  96. return sorted(startValues)
  97.  
  98. def getRecruits(schoolName, year):
  99. """
  100. Returns a map of recruit names and Recruit objects that are all the committed recruits to the given school for the given year.
  101. """
  102. values = {'sport': 'football', 'year': year, 'college': schoolName, 'offered': '1', 'hsprospects': '1', 'prepprospects': '1', 'jucoprospects': 1, 'sort_columns': 'rivalsrating'}
  103.  
  104. html = getHtml(SEARCH_URI, values)
  105. soup = bs4.BeautifulSoup(html, 'html5lib')
  106.  
  107. startValues = getStartValues(soup)
  108. soups = [soup]
  109. for start in startValues:
  110. values['start'] = start
  111. html = getHtml(SEARCH_URI, values)
  112. soup = bs4.BeautifulSoup(html, 'html5lib')
  113. soups.append(soup)
  114.  
  115. recruits = {}
  116. for soup in soups:
  117. table = soup.find(id='ysr-search-results')
  118. body = table.tbody
  119. rows = body.find_all('tr')
  120. for row in rows:
  121. recruit = Recruit(row)
  122. recruits[recruit.name] = recruit
  123.  
  124. return recruits
  125.  
  126. def printStats(schoolName, recruits, isSummary=False):
  127. """
  128. Print a variety statistics about the given recruiting class.
  129. """
  130. def median(ratings):
  131. if len(ratings) == 0:
  132. return 0.0
  133.  
  134. if len(ratings) == 1:
  135. return ratings[0]*1.0
  136.  
  137. if len(ratings) % 2 == 0:
  138. mid = len(ratings)/2
  139. mid = ratings[mid-1:mid+1]
  140. return sum(mid)/len(mid)
  141. else:
  142. return ratings[len(ratings)/2]
  143.  
  144. offeredRatings = sorted([recruit.rating for recruit in recruits.values() if recruit.rating != 0])
  145. committedRatings = sorted([recruit.rating for recruit in recruits.values() if recruit.rating != 0 and recruit.school == schoolName])
  146. offeredStars = [recruit.stars for recruit in recruits.values()]
  147. committedStars = [recruit.stars for recruit in recruits.values() if recruit.school == schoolName]
  148.  
  149. #Offered mean/median
  150. offeredMeanRating = sum(offeredRatings)/len(offeredRatings)
  151. offeredMedianRating = median(offeredRatings)
  152.  
  153. #Committed mean/median
  154. committedMeanRating = sum(committedRatings)/len(committedRatings)
  155. committedMedianRating = median(offeredRatings)
  156.  
  157. #Commits/offer
  158. commits = 0
  159. for recruit in recruits.itervalues():
  160. if recruit.school == schoolName:
  161. commits += 1
  162.  
  163. #Offered by star
  164. offeredStarBuckets = defaultdict(int)
  165. for star in offeredStars:
  166. offeredStarBuckets[star] += 1
  167.  
  168. #Commits by star
  169. committedStarBuckets = defaultdict(int)
  170. for star in committedStars:
  171. committedStarBuckets[star] += 1
  172.  
  173. #Output
  174. header = '%s recruits (%d):' % (schoolName, len(recruits))
  175. print '\n'+header
  176. print '='*len(header)
  177. sortedRecruits = sorted([(recruit.rating, recruit) for recruit in recruits.itervalues()], reverse=True)
  178. for (recruitName, recruit) in sortedRecruits:
  179. print ' %s - %s - %s' % (recruit.name, recruit.rating, recruit.school)
  180.  
  181. print '\nAverage Offered Rating: %.2f' % offeredMeanRating
  182. if not isSummary:
  183. print 'Average Committed Rating: %.2f' % committedMeanRating
  184.  
  185. print '\nMedian Offered Rating: %.2f' % offeredMedianRating
  186. if not isSummary:
  187. print 'Median Committed Rating: %.2f' % committedMedianRating
  188.  
  189. print '\nCommits: %d' % commits
  190. if not isSummary:
  191. print 'Commits per offer: %.2f' % (float(commits)/len(recruits))
  192.  
  193. starTableHeader = ['# of Stars:', 'Offered:', 'Committed:']
  194. colWidths = [len(header) for header in starTableHeader]
  195. print '\n'+' '.join(starTableHeader)
  196. print ' '.join(['='*width for width in colWidths])
  197. for i in range(5,2,-1):
  198. print '%*d %*d %*d' % (colWidths[0], i, colWidths[1], offeredStarBuckets[i], colWidths[2], committedStarBuckets[i])
  199.  
  200. def dictInstersection(left, right):
  201. """
  202. Returns a dict that is the intersection between the two input dictionaries. Uses the values from the first dictionary argument.
  203. """
  204. intersection = {}
  205. intersectNames = left.viewkeys() & right.viewkeys()
  206. return {name: left[name] for name in intersectNames}
  207.  
  208. def error(msg, status=1):
  209. if msg:
  210. sys.stderr.write(msg)
  211. sys.exit(status)
  212.  
  213. def main():
  214. (options, args) = handleOptions()
  215.  
  216. if options.verbose:
  217. print 'Retrieving list of schools...'
  218. schoolList = getSchoolList()
  219. if options.list:
  220. print 'Schools:'
  221. print '========'
  222. for school in schoolList:
  223. print school
  224. return
  225. lowerSchools = [s.lower() for s in schoolList]
  226.  
  227. if options.first.lower() not in lowerSchools:
  228. error("First school '%s' could not be found on Rivals list." % options.first)
  229.  
  230. compare = len(options.second.strip()) > 0
  231. if compare and options.second.lower() not in lowerSchools:
  232. error("Second school '%s' could not be found on Rivals list." % options.second)
  233.  
  234. secondRecruits = {}
  235. if not options.load:
  236. if options.verbose:
  237. print 'Retrieving %s recruits...' % options.first
  238. firstRecruits = getRecruits(options.first, options.year)
  239. if compare:
  240. if options.verbose:
  241. print 'Retrieving %s recruits...' % options.second
  242. secondRecruits = getRecruits(options.second, options.year)
  243.  
  244. pickleFile = '%s_%s_%s.pkl' % (options.year, options.first.lower().replace(' ', '-'), options.second.lower().replace(' ', '-'))
  245. if options.save:
  246. tmp = (options.year, options.first, firstRecruits, options.second, secondRecruits)
  247. with open(pickleFile, 'wb') as f:
  248. pickle.dump(tmp, f)
  249.  
  250. if options.load:
  251. with open(pickleFile, 'rb') as f:
  252. (options.year, options.first, firstRecruits, options.second, secondRecruits) = pickle.load(f)
  253.  
  254. if options.stats:
  255. printStats(options.first, firstRecruits)
  256. if compare:
  257. printStats(options.second, secondRecruits)
  258.  
  259. print '\n-------------------------------------'
  260. if compare:
  261. jointRecruits = dictInstersection(firstRecruits, secondRecruits)
  262. if options.stats:
  263. print '\n# of common recruits: %d' % len(jointRecruits)
  264. commonFirstRecruits = {name: recruit for (name, recruit) in jointRecruits.items() if recruit.school == options.first}
  265. commonSecondRecruits = {name: recruit for (name, recruit) in jointRecruits.items() if recruit.school == options.second}
  266.  
  267. printStats(options.first, commonFirstRecruits, True)
  268. printStats(options.second, commonSecondRecruits, True)
  269. else:
  270. committedRecruits = {name: recruit for (name, recruit) in firstRecruits.items() if recruit.school == options.first}
  271. printStats(options.first, committedRecruits, True)
  272.  
  273. if __name__ == '__main__':
  274. main()
  275.  
Runtime error #stdin #stdout #stderr 0.18s 12832KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Traceback (most recent call last):
  File "prog.py", line 7, in <module>
ImportError: No module named bs4