import sys
from optparse import OptionParser
import urllib
import urllib2
import bs4
from collections import defaultdict
import pickle
# Page holding the recruit search form; getSchoolList() reads the
# 'school-football' <select> on it to learn the valid school names.
# NOTE(review): the host part of both literals contains a literal '...'
# elision ("content-available-to-author-only"), so it looks redacted --
# confirm and restore the real host before running.
MAIN_URI='http://s...content-available-to-author-only...o.com/ncaa/football/recruiting/recruit-search'
# Endpoint that getRecruits() sends the search form fields to.
SEARCH_URI='http://s...content-available-to-author-only...o.com/ncaa/football/recruiting/recruit-search-results'
def handleOptions():
    """
    Builds the command-line parser and parses sys.argv.

    Returns the (options, args) pair produced by OptionParser.parse_args().
    """
    parser = OptionParser(description='Compare common recruits from two schools based upon Rivals rankings.')

    # One row per option, in the order they should appear in --help:
    # (short flag, long flag, action, dest, default, help text).
    optionTable = [
        ('-l', '--list', 'store_true', 'list', False, 'List available schools'),
        ('-1', '--first', 'store', 'first', '', 'First school to compare'),
        ('-2', '--second', 'store', 'second', '', 'Second school to compare'),
        ('-y', '--year', 'store', 'year', '2014', 'Year to compare'),
        ('-s', '--stats', 'store_true', 'stats', False, 'Print advanced class statistics'),
        ('-v', '--verbose', 'store_true', 'verbose', False, 'Use verbose output'),
        ('', '--save', 'store_true', 'save', False, 'Save scraped output to intermediate file'),
        ('', '--load', 'store_true', 'load', False, 'Load scraped output from intermediate file'),
    ]
    for (shortFlag, longFlag, action, dest, default, helpText) in optionTable:
        parser.add_option(shortFlag, longFlag, action=action, dest=dest, default=default, help=helpText)

    return parser.parse_args()
class Recruit(object):
    """
    Plain-value record for one recruit, built from a search-results table row.

    Attributes: pos, name, location, height, weight, forty, rank and school
    are strings; stars is an int (0 when the stars cell has no <span>);
    rating is a float (0.0 when the rating cell cannot be parsed).
    """

    def __init__(self, row):
        """
        row: a table row whose .contents holds at least ten cells, read by
        position (0=pos, 1=name link, 2=location, 3=height, 4=weight,
        5=forty, 6=stars, 7=rating, 8=rank, 9=school link).
        """
        cells = row.contents
        # Only plain values are stored; the row itself is not retained.
        # NOTE(review): under Python 2, str() raises UnicodeEncodeError for
        # non-ASCII text -- confirm names/locations are always ASCII.
        self.pos = str(cells[0].string)
        self.name = str(cells[1].a.string)
        self.location = str(cells[2].string)
        self.height = str(cells[3].string)
        self.weight = str(cells[4].string)
        self.forty = str(cells[5].string)

        # The star count is the first word of the span text.
        starSpan = cells[6].span
        if starSpan is not None:
            self.stars = int(starSpan.string.split()[0])
        else:
            self.stars = 0

        try:
            self.rating = float(cells[7].string)
        except (TypeError, ValueError):
            # ValueError: non-numeric text. TypeError: .string is None,
            # which float() cannot convert (e.g. an empty cell).
            self.rating = 0.0

        self.rank = str(cells[8].string)
        self.school = str(cells[9].div.a.string)
def getHtml(uri, values=None):
    """
    Retrieves HTML returned by the given URI.

    uri: address to request.
    values: optional mapping of form fields. It is url-encoded with
        doseq=True, so a sequence value expands into one field per element,
        and the encoded string is passed to urllib2.Request as its data.

    Returns the response body as read from the connection. Errors raised by
    urllib2.urlopen() are not caught here.
    """
    # None instead of a literal {} default: a dict default is created once
    # and shared by every call.
    fields = {} if values is None else values
    data = urllib.urlencode(fields, True)
    req = urllib2.Request(uri, data)
    req.add_header('User-agent', 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11')
    f = urllib2.urlopen(req)
    try:
        return f.read()
    finally:
        # Always release the connection, even if read() fails.
        f.close()
def getSchoolList():
    """
    Returns a list of school names valid to search for.

    The names are the 'value' attributes of the <option> tags inside the
    element with id 'school-football' on the page at MAIN_URI.
    """
    page = bs4.BeautifulSoup(getHtml(MAIN_URI), 'html5lib')
    dropdown = page.find(id='school-football')
    schools = []
    # The first <option> is skipped (presumably a placeholder entry --
    # verify against the live page).
    for option in dropdown.find_all('option')[1:]:
        schools.append(option['value'])
    return schools
def getStartValues(soup):
    """
    Returns a list of start value tuples that define a range of recruit numbers suitable for acting as page navigation.

    soup: parsed results page containing the element with id
        'ysr-search-results-index-top', whose <ul> holds one <li> per
        navigation entry.

    Each tuple is (button value, hidden 'start' input value); the second
    item is None when the entry's form has no input named 'start'.
    Duplicates are collapsed and the result is sorted.
    """
    navDiv = soup.find(id='ysr-search-results-index-top')
    startValues = set()
    for li in navDiv.ul.find_all('li'):
        # The 'summary' item is not a page link.
        if 'summary' in li.get('class', []):
            continue
        button = li.form.button
        # Entries whose button is disabled are skipped.
        if 'disabled' in button.attrs:
            continue
        hiddenStart = None
        for inputTag in li.form.find_all('input'):
            # .get() so an <input> without a name attribute is ignored
            # instead of raising KeyError.
            if inputTag.get('name') == 'start':
                hiddenStart = inputTag['value']
        startValues.add((button['value'], hiddenStart))
    return sorted(startValues)
def getRecruits(schoolName, year):
    """
    Returns a map of recruit name to Recruit object for the given school and year.

    The search is sent to SEARCH_URI with the 'offered' filter set to '1'.
    NOTE(review): that presumably means every recruit the school offered,
    not only its commits -- confirm against the site.

    The first results page is fetched, then one further page per entry
    returned by getStartValues(). Recruits are keyed by name, so a later row
    with the same name replaces an earlier one.
    """
    query = {'sport': 'football', 'year': year, 'college': schoolName, 'offered': '1', 'hsprospects': '1', 'prepprospects': '1', 'jucoprospects': 1, 'sort_columns': 'rivalsrating'}

    firstPage = bs4.BeautifulSoup(getHtml(SEARCH_URI, query), 'html5lib')
    pages = [firstPage]
    for start in getStartValues(firstPage):
        # start is a (button, hidden) tuple; getHtml() encodes sequence
        # values as repeated 'start' fields.
        query['start'] = start
        pages.append(bs4.BeautifulSoup(getHtml(SEARCH_URI, query), 'html5lib'))

    found = {}
    for page in pages:
        resultsTable = page.find(id='ysr-search-results')
        for row in resultsTable.tbody.find_all('tr'):
            recruit = Recruit(row)
            found[recruit.name] = recruit
    return found
def _mean(values):
    """Returns the arithmetic mean of values as a float, or 0.0 when empty."""
    if not values:
        return 0.0
    return float(sum(values)) / len(values)


def _median(sortedValues):
    """Returns the median of an ascending-sorted list, or 0.0 when empty."""
    count = len(sortedValues)
    if count == 0:
        return 0.0
    mid = count // 2
    if count % 2 == 0:
        return (sortedValues[mid - 1] + sortedValues[mid]) / 2.0
    return sortedValues[mid] * 1.0


def printStats(schoolName, recruits, isSummary=False):
    """
    Print a variety of statistics about the given recruiting class.

    schoolName: school the class belongs to; a recruit counts as committed
        when its .school equals this name exactly.
    recruits: dict of recruit name -> Recruit. May be empty, in which case
        every statistic is reported as zero.
    isSummary: when True, the committed-only lines (average and median
        committed rating, commits per offer) are omitted.

    Output goes to stdout: a header, one line per recruit in descending
    rating order, the rating statistics, and a star table with rows for
    5, 4 and 3 stars.
    """
    everyone = list(recruits.values())
    committed = [recruit for recruit in everyone if recruit.school == schoolName]

    # Recruit stores 0.0 when a rating could not be parsed, so zero ratings
    # are left out of the averages.
    offeredRatings = sorted(recruit.rating for recruit in everyone if recruit.rating != 0)
    committedRatings = sorted(recruit.rating for recruit in committed if recruit.rating != 0)

    offeredMeanRating = _mean(offeredRatings)
    offeredMedianRating = _median(offeredRatings)
    committedMeanRating = _mean(committedRatings)
    # Median of the committed ratings (previously computed from the offered
    # ratings by mistake).
    committedMedianRating = _median(committedRatings)

    commits = len(committed)

    offeredStarBuckets = defaultdict(int)
    for recruit in everyone:
        offeredStarBuckets[recruit.stars] += 1
    committedStarBuckets = defaultdict(int)
    for recruit in committed:
        committedStarBuckets[recruit.stars] += 1

    # Output. Each print takes one parenthesised expression so the code
    # parses as both a Python 2 print statement and a Python 3 call.
    header = '%s recruits (%d):' % (schoolName, len(recruits))
    print('\n' + header)
    print('=' * len(header))
    # Sort on the rating alone so equal ratings never compare Recruit objects.
    for recruit in sorted(everyone, key=lambda r: r.rating, reverse=True):
        print(' %s - %s - %s' % (recruit.name, recruit.rating, recruit.school))

    print('\nAverage Offered Rating: %.2f' % offeredMeanRating)
    if not isSummary:
        print('Average Committed Rating: %.2f' % committedMeanRating)
    print('\nMedian Offered Rating: %.2f' % offeredMedianRating)
    if not isSummary:
        print('Median Committed Rating: %.2f' % committedMedianRating)
    print('\nCommits: %d' % commits)
    if not isSummary:
        commitsPerOffer = float(commits) / len(recruits) if recruits else 0.0
        print('Commits per offer: %.2f' % commitsPerOffer)

    starTableHeader = ['# of Stars:', 'Offered:', 'Committed:']
    colWidths = [len(title) for title in starTableHeader]
    print('\n' + ' '.join(starTableHeader))
    print(' '.join(['=' * width for width in colWidths]))
    for stars in range(5, 2, -1):
        print('%*d %*d %*d' % (colWidths[0], stars, colWidths[1], offeredStarBuckets[stars], colWidths[2], committedStarBuckets[stars]))
def dictInstersection(left, right):
    """
    Returns a dict that is the intersection between the two input dictionaries. Uses the values from the first dictionary argument.

    left: mapping whose values are kept.
    right: mapping used only to test which keys are shared.
    """
    # Plain membership tests instead of viewkeys(), which only exists on
    # Python 2 dicts.
    return {name: left[name] for name in left if name in right}
def error(msg, status=1):
    """
    Reports a fatal error and terminates the program.

    msg: text written to stderr; nothing is written when it is empty.
    status: process exit status passed to sys.exit().

    Always raises SystemExit.
    """
    if msg:
        sys.stderr.write(msg)
        # Finish the line so the message is not run together with whatever
        # the terminal prints next.
        if not msg.endswith('\n'):
            sys.stderr.write('\n')
    sys.exit(status)
def main():
    """
    Command-line entry point.

    Fetches the school list, then either prints it (--list) or validates the
    requested school(s), obtains their recruits (scraped, or read from the
    pickle file with --load), and prints the statistics. With --save the
    scraped data is written to the pickle file. When both --save and --load
    are given, the data is loaded and nothing is re-saved.
    """
    (options, args) = handleOptions()

    if options.verbose:
        print('Retrieving list of schools...')
    schoolList = getSchoolList()

    if options.list:
        print('Schools:')
        print('========')
        for school in schoolList:
            print(school)
        return

    # Names are validated case-insensitively, but the search request and the
    # commit checks in printStats() compare exact strings, so switch to the
    # spelling used in the school list once a name has been accepted.
    canonicalNames = {}
    for school in schoolList:
        canonicalNames.setdefault(school.lower(), school)

    if options.first.lower() not in canonicalNames:
        error("First school '%s' could not be found on Rivals list." % options.first)
    options.first = canonicalNames[options.first.lower()]

    compare = len(options.second.strip()) > 0
    if compare:
        if options.second.lower() not in canonicalNames:
            error("Second school '%s' could not be found on Rivals list." % options.second)
        options.second = canonicalNames[options.second.lower()]

    pickleFile = '%s_%s_%s.pkl' % (options.year, options.first.lower().replace(' ', '-'), options.second.lower().replace(' ', '-'))

    secondRecruits = {}
    if options.load:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by this tool's --save.
        with open(pickleFile, 'rb') as f:
            (options.year, options.first, firstRecruits, options.second, secondRecruits) = pickle.load(f)
    else:
        if options.verbose:
            print('Retrieving %s recruits...' % options.first)
        firstRecruits = getRecruits(options.first, options.year)
        if compare:
            if options.verbose:
                print('Retrieving %s recruits...' % options.second)
            secondRecruits = getRecruits(options.second, options.year)
        if options.save:
            tmp = (options.year, options.first, firstRecruits, options.second, secondRecruits)
            with open(pickleFile, 'wb') as f:
                pickle.dump(tmp, f)

    if options.stats:
        printStats(options.first, firstRecruits)
        if compare:
            printStats(options.second, secondRecruits)
        print('\n-------------------------------------')

    if compare:
        jointRecruits = dictInstersection(firstRecruits, secondRecruits)
        if options.stats:
            print('\n# of common recruits: %d' % len(jointRecruits))
        # Split the shared recruits by which of the two schools they chose.
        commonFirstRecruits = {name: recruit for (name, recruit) in jointRecruits.items() if recruit.school == options.first}
        commonSecondRecruits = {name: recruit for (name, recruit) in jointRecruits.items() if recruit.school == options.second}
        printStats(options.first, commonFirstRecruits, True)
        printStats(options.second, commonSecondRecruits, True)
    else:
        committedRecruits = {name: recruit for (name, recruit) in firstRecruits.items() if recruit.school == options.first}
        printStats(options.first, committedRecruits, True)
# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()