fork(1) download
  1. import re
  2.  
  3. subtitle_match_count = 0
  4. rx = r"""
  5. (^[0-9]{2}:[0-9]{2}:[0-9]{2}[.,][0-9]{3}) # match TC-IN in group1
  6. [ ]-->[ ] # VTT/SRT style TC-IN--TC-OUT separator
  7. ([0-9]{2}:[0-9]{2}:[0-9]{2}[.,][0-9]{3}) # match TC-OUT n group2
  8. (.*)\r?\n # additional VTT info (like) alignment
  9. ([\s\S]*?)\s*(?:(?:\r?\n){2}|\Z) # subtitle_content
  10. """
  11.  
  12. s = "WEBVTT\n\n00:00:00.440 --> 00:00:02.320 align:middle line:-1\nHi.\n\n00:00:03.440 --> 00:00:07.520 align:middle line:-1\nThis subtitle has one line.\n\n00:00:09.240 --> 00:00:11.080 align:middle line:-2\nThis subtitle has\ntwo lines.\n\n00:00:15.240 --> 00:00:23.960 align:middle line:-4\nNow...\nLet's try\nfour...\nlines...\n\n00:00:24.080 --> 00:00:27.080 align:middle"
  13.  
  14. matches = re.finditer(rx, s, re.VERBOSE | re.MULTILINE)
  15. for match in matches:
  16. subtitle_match_count += 1
  17. group1, group2, group3, group4 = match.groups()
  18. tc_in = group1.strip()
  19. tc_out = group2.strip()
  20. vtt_extra_info = group3
  21. subtitle_content = group4
  22. print "*** subtitle match count: %d ***" % subtitle_match_count
  23. print "TIMECODE IN".ljust(20), tc_in
  24. print "TIMECODE OUT".ljust(20), tc_out
  25. print "ALIGN".ljust(20), vtt_extra_info.strip()
  26. print "SUBTITLE CONTENT".ljust(20), subtitle_content
  27. print
Success #stdin #stdout 0s 23352KB
stdin
Standard input is empty
stdout
*** subtitle match count: 1 ***
TIMECODE IN          00:00:00.440
TIMECODE OUT         00:00:02.320
ALIGN                align:middle line:-1
SUBTITLE CONTENT     Hi.

*** subtitle match count: 2 ***
TIMECODE IN          00:00:03.440
TIMECODE OUT         00:00:07.520
ALIGN                align:middle line:-1
SUBTITLE CONTENT     This subtitle has one line.

*** subtitle match count: 3 ***
TIMECODE IN          00:00:09.240
TIMECODE OUT         00:00:11.080
ALIGN                align:middle line:-2
SUBTITLE CONTENT     This subtitle has
two lines.

*** subtitle match count: 4 ***
TIMECODE IN          00:00:15.240
TIMECODE OUT         00:00:23.960
ALIGN                align:middle line:-4
SUBTITLE CONTENT     Now...
Let's try
four...
lines...