fork download
  1. #!/usr/bin/env python
  2. # Developed with Python 2.7
  3. # Copyright 2012 Logan Ding <logan.ding@gmail.com>. All Rights Reserved.
  4. #
  5. #---------------------------------------------
  6. # Coursera.org Downloader <Version 1.1>
  7. # by Logan Ding
  8. #---------------------------------------------
  9. #
  10. # Depends on 'mechanize'. Run 'easy_install mechanize' first if 'mechanize' is not installed.
  11. # Be sure to change the email and the password in main() to your own before running.
  12. #
  13. # Run as: 'python coursera_downloader.py' will download to CWD.
  14. # Run as: 'python coursera_downloader.py <dir>' will download to path <dir>.
  15. #
  16. # Only single-threaded downloading is supported right now.
  17. # Add courses yourself. Not all of them are tested. Feedback is welcome.
  18. # Downloads videos, subtitles, PDF and PPT(X) slides.
  19. # Subtitles for 'modelthinking' cannot be resolved yet; they are ignored for now.
  20.  
  21. import cookielib, re, sys, os
  22. try:
  23. import mechanize
  24. except ImportError, e:
  25. print e
  26. print 'You must install "mechanize" first. Can use "easy_install": easy_install mechanize'
  27. sys.exit(1)
  28.  
  29. def split_string(source,splitlist):
  30. if source == '':
  31. return [source]
  32. result = []
  33. tmp = ''
  34. for c in source:
  35. if c not in splitlist:
  36. tmp += c
  37. else:
  38. if tmp != '':
  39. result.append(tmp)
  40. tmp = ''
  41. if tmp != '':
  42. result.append(tmp)
  43. return result
  44.  
  45. def resolve_name_with_hex(name):
  46. r = re.finditer(r'%\w\w', name)
  47. for m in r:
  48. c = m.group()[1:].decode('hex')
  49. c = c if c not in '\/:*?"<>|' else '_'
  50. name = re.sub(m.group(), c, name)
  51. return name
  52.  
  53. def resolve_name_with_illegal_char(name):
  54. return re.sub(r'[\\/:*?"<>|]', ' -', name)
  55.  
  56. def initialize_browser(course, email, password):
  57. #Use mechanize to handle cookie
  58. print
  59. print 'Initialize browsering session...'
  60. br = mechanize.Browser()
  61. cj = cookielib.LWPCookieJar()
  62. br.set_cookiejar(cj)
  63. br.set_handle_equiv(True)
  64. #br.set_handle_gzip(True)
  65. br.set_handle_redirect(True)
  66. br.set_handle_referer(True)
  67. br.set_handle_robots(False)
  68. br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time = 0)
  69. auth_url = 'https://c...content-available-to-author-only...a.org/****/auth/auth_redirector?type=login&subtype=normal&email'.replace('****', course)
  70. br.open(auth_url)
  71.  
  72. br.select_form(nr = 0)
  73. br.form['email'] = email
  74. br.form['password'] = password
  75. br.submit()
  76. print 'It takes seconds to login and resolve resources to download...\n'
  77.  
  78. #Check if email + password submitted correctly
  79. if 'https://c...content-available-to-author-only...a.org/****/auth/login_receiver?data='.replace('****', course) not in br.geturl():
  80. print 'Failed to login, exit...'
  81. sys.exit(1)
  82.  
  83. video_lectures = 'https://c...content-available-to-author-only...a.org/****/lecture/index'.replace('****', course)
  84. br.open(video_lectures)
  85. return br
  86.  
  87. def resolve_resources(br, path, course):
  88. title = []
  89. pdf = []
  90. pptx = []
  91. link_txt = []
  92. link_srt = []
  93. link_video = []
  94.  
  95. for l in br.links():
  96. m_title = re.search(r'text=[\'\"](.+)[\'\"], tag=\'a\', .+\'class\', \'lecture-link\'', str(l))
  97. m_pdf = re.search(r'https*:[\S]+/([\S]+\.pdf)', str(l))
  98. m_pptx = re.search(r'https*:[\S]+/([\S]+\.pptx*)', str(l))
  99. m_txt = re.search(r'url=\'(https:[\S]+subtitles\?[\S]+=txt)', str(l))
  100. m_srt = re.search(r'url=\'(https:[\S]+subtitles\?[\S]+=srt)', str(l))
  101. m_video = re.search(r'https:[\S]+download.mp4[\S]+\'', str(l))
  102.  
  103. if m_title:
  104. title.append(resolve_name_with_illegal_char(m_title.group(1).strip()))
  105. if m_pdf:
  106. pdf.append([resolve_name_with_hex(m_pdf.group(1)), m_pdf.group()])
  107. if m_pptx:
  108. pptx.append([resolve_name_with_hex(m_pptx.group(1)), m_pptx.group()])
  109. if m_txt:
  110. link_txt.append(m_txt.group(1))
  111. if m_srt:
  112. link_srt.append(m_srt.group(1))
  113. if m_video:
  114. link_video.append(m_video.group().rstrip("'"))
  115.  
  116. if len(title) == len(link_video):
  117. video = zip([t+'.mp4' for t in title], link_video)
  118. else:
  119. print 'Video names resolving error. Ignore videos...'
  120. video = []
  121. # Here is a buggy way to handle different numbers of videos and subtitles for 'modelthinking' and 'saas'.
  122. # To completely solve the problem, need to change the links resolve and match method completely.
  123. # Will fix this if have time. Right now, this inelegant way can handle 'saas' only.
  124. if len(title) == len(link_srt):
  125. srt = zip([t+'.srt' for t in title], link_srt)
  126. elif course == 'saas':
  127. srt = zip([t+'.srt' for t in title[len(title)-len(link_srt) : ]], link_srt)
  128. else:
  129. print 'Can NOT match video names with subtitiles. Ignore...'
  130. srt = []
  131.  
  132. if len(title) == len(link_txt):
  133. txt = zip([t+'.txt' for t in title], link_txt)
  134. elif course == 'saas':
  135. txt = zip([t+'.txt' for t in title[len(title)-len(link_txt) : ]], link_txt)
  136. else:
  137. print 'Can NOT match video names with subtitiles. Ignore...'
  138. txt = []
  139. return video, srt, txt, pdf, pptx
  140.  
  141. def downloader(video, srt, txt, pdf, pptx, br, path):
  142. # Only single download thread supported right now.
  143. print
  144. print 'Videos can be downloaded:'
  145. v = choose_download(video)
  146. print 'srt subtitles can be downloaded:'
  147. s = choose_download(srt)
  148. print 'txt subtitles can be downloaded:'
  149. t = choose_download(txt)
  150. print 'PDF slides can be downloaded:'
  151. f = choose_download(pdf)
  152. print 'PPT slides can be downloaded:'
  153. x = choose_download(pptx)
  154.  
  155. # Combine all to be downloaded together for multiple downloading threads later
  156. all = v + s + t + f + x
  157. for r in all:
  158. filename = os.path.join(path, r[0])
  159. print 'Downloading', r[0]
  160. br.retrieve(r[1], filename)
  161.  
  162. def choose_course(course):
  163. for key in sorted(course.keys()):
  164. print key, ':', course[key]
  165. choice = raw_input('Please choose course by number: ')
  166. while choice not in course.keys():
  167. choice = raw_input('Invalid choice, input again or Enter to quit: ')
  168. if choice == '':
  169. sys.exit(1)
  170. return course[choice]
  171.  
  172. def parse_choice(input):
  173. if input == '':
  174. return input
  175. input = split_string(input, ' ,')
  176. # This split can handle your input as: 1,3,4-5 or 1 3 4-5 or 1, 3, 4-5. Besides, range input support 4-5 or 4:5
  177. choice = []
  178. for e in input:
  179. if e.isdigit():
  180. if e not in choice:
  181. choice.append(int(e))
  182. else:
  183. s = split_string(e, ':-')
  184. if len(s) != 2 or not s[0].isdigit() or not s[1].isdigit():
  185. print 'Ignore invalid input %s' %e
  186. else:
  187. for num in range(int(s[0]), int(s[1])+1):
  188. if num not in choice:
  189. choice.append(num)
  190. return sorted(choice)
  191.  
  192. def choose_download(resource):
  193. for i in range(len(resource)):
  194. print '['+repr(i).rjust(2)+']:', resource[i][0]
  195. print 'Enter your choice, such as: 1, 3, 5-9. Or just Enter to skip.'
  196. choice = raw_input('>')
  197. choice = parse_choice(choice)
  198. print 'To be downloaded:', choice
  199. print
  200. download = []
  201. for i in choice:
  202. if i in range(len(resource)):
  203. download.append(resource[i])
  204. return download
  205.  
  206. def download_path():
  207. if len(sys.argv) > 1:
  208. if not os.path.exists(sys.argv[1]):
  209. try:
  210. os.mkdir(sys.argv[1])
  211. except Exception, error:
  212. print error
  213. sys.exit(1)
  214. return os.path.abspath(sys.argv[1])
  215. else:
  216. return os.path.abspath('.')
  217.  
  218. def main():
  219. print '----------------------------------'
  220. print '- Coursera.org Downloader -'
  221. print '- by Logan Ding -'
  222. print '----------------------------------'
  223. print
  224. # Add courses by yourself. Not all tested. You can feed back.
  225. course = { '1' : 'modelthinking',
  226. '2' : 'gametheory',
  227. '3' : 'crypto',
  228. '4' : 'saas',
  229. '5' : 'pgm',
  230. '6' : 'algo'}
  231.  
  232. # Your Coursera.org email and password needed here to download videos.
  233. email = 'youremail'
  234. password = 'password'
  235.  
  236. if email == 'youremail':
  237. print 'You must change the email and the password to yours in main() first.'
  238. sys.exit(1)
  239.  
  240. path = download_path()
  241. print 'All files will be downloaded to:', path
  242. print
  243. course = choose_course(course)
  244. br = initialize_browser(course, email, password)
  245. vidoe, srt, txt, pdf, pptx = resolve_resources(br, path, course)
  246. downloader(vidoe, srt, txt, pdf, pptx, br, path)
  247.  
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
  250.  
Runtime error #stdin #stdout 0.02s 5760KB
stdin
Standard input is empty
stdout
Standard output is empty