fork(3) download
  1. import re
  2.  
  3. urls = ['http://www.stackoverflow.com/lifestyle/tech/this-is-a-very-nice-headline-my-friend/2013/04/26/acjhrjk-2e1-1krjke4-9el8c-2eheje_story.html?tid=sm_fb',
  4. 'http://www.stackoverflow.com/2015/07/15/sports/baseball/another-very-nice.html?smid=tw-somedia&seid=auto',
  5. 'http://w...content-available-to-author-only...k.com/news/2013/07/22/54216-hello-another-one-here?lite',
  6. 'http://w...content-available-to-author-only...k.com/article_email/hello-one-here-that-is-cool-1545545554-lMyQjAxMTAHFJELMDgxWj',
  7. 'http://w...content-available-to-author-only...k.com/2013/11/13/tech/tricky-one/the-real-one/index.html',
  8. 'http://w...content-available-to-author-only...k.com/2013/11/13/tech/the-good-one.html',
  9. 'http://w...content-available-to-author-only...k.com/news/science-and-technology/54512-hello-world-here-is-a-weird-character#b02g07f20b14']
  10.  
  11. regex = re.compile(r'(?<=/)([-\w]+)(?=[.?/#]|$)')
  12. digits = re.compile(r'-?\d{3,}-?')
  13.  
  14. for url in urls:
  15. substrings = regex.findall(url)
  16. longest = max(substrings, key=len)
  17. headline = re.sub(digits, '', longest)
  18. print headline
Success #stdin #stdout 0.01s 9016KB
stdin
Standard input is empty
stdout
this-is-a-very-nice-headline-my-friend
another-very-nice
hello-another-one-here
hello-one-here-that-is-coollMyQjAxMTAHFJELMDgxWj
the-real-one
the-good-one
hello-world-here-is-a-weird-character