import re
# Demo script: extract cues (timecodes, cue settings and text) from a
# WebVTT/SRT-style string with one verbose regular expression and print
# every cue that was found.

# Number of cues matched so far; stays 0 when nothing matches.
subtitle_match_count = 0

# Compiled with re.VERBOSE, so whitespace and "#" comments inside the pattern
# are ignored, and with re.MULTILINE, so "^" anchors at the start of each line.
rx = r"""
    (^[0-9]{2}:[0-9]{2}:[0-9]{2}[.,][0-9]{3})   # group 1: TC-IN (at line start)
    [ ]-->[ ]                                   # VTT/SRT style TC-IN --> TC-OUT separator
    ([0-9]{2}:[0-9]{2}:[0-9]{2}[.,][0-9]{3})    # group 2: TC-OUT
    (.*)\r?\n                                   # group 3: additional VTT info (like alignment)
    ([\s\S]*?)\s*(?:(?:\r?\n){2}|\Z)            # group 4: subtitle content, up to a blank line or end of input
    """

# Sample input. The last cue is only a header line with no trailing line
# break; the pattern requires a line break after the header, so that cue is
# not reported.
s = (
    "WEBVTT\n"
    "\n"
    "00:00:00.440 --> 00:00:02.320 align:middle line:-1\n"
    "Hi.\n"
    "\n"
    "00:00:03.440 --> 00:00:07.520 align:middle line:-1\n"
    "This subtitle has one line.\n"
    "\n"
    "00:00:09.240 --> 00:00:11.080 align:middle line:-2\n"
    "This subtitle has\n"
    "two lines.\n"
    "\n"
    "00:00:15.240 --> 00:00:23.960 align:middle line:-4\n"
    "Now...\n"
    "Let's try\n"
    "four...\n"
    "lines...\n"
    "\n"
    "00:00:24.080 --> 00:00:27.080 align:middle"
)

matches = re.finditer(rx, s, re.VERBOSE | re.MULTILINE)
for subtitle_match_count, match in enumerate(matches, 1):
    group1, group2, group3, group4 = match.groups()
    tc_in = group1.strip()
    tc_out = group2.strip()
    vtt_extra_info = group3
    subtitle_content = group4
    # Each print call takes one pre-formatted string so the script produces
    # the same output under Python 2 (print statement) and Python 3
    # (print function).
    print("*** subtitle match count: %d ***" % subtitle_match_count)
    print("%s %s" % ("TIMECODE IN".ljust(20), tc_in))
    print("%s %s" % ("TIMECODE OUT".ljust(20), tc_out))
    print("%s %s" % ("ALIGN".ljust(20), vtt_extra_info.strip()))
    print("%s %s" % ("SUBTITLE CONTENT".ljust(20), subtitle_content))
    # print("") rather than print(): under Python 2 the latter prints "()".
    print("")
aW1wb3J0IHJlCgpzdWJ0aXRsZV9tYXRjaF9jb3VudCA9IDAKcnggPSByIiIiCgkoXlswLTldezJ9OlswLTldezJ9OlswLTldezJ9Wy4sXVswLTldezN9KSAgICMgbWF0Y2ggVEMtSU4gaW4gZ3JvdXAxCglbIF0tLT5bIF0gICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIFZUVC9TUlQgc3R5bGUgVEMtSU4tLVRDLU9VVCBzZXBhcmF0b3IKCShbMC05XXsyfTpbMC05XXsyfTpbMC05XXsyfVsuLF1bMC05XXszfSkgICAgIyBtYXRjaCBUQy1PVVQgbiBncm91cDIKCSguKilccj9cbiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgYWRkaXRpb25hbCBWVFQgaW5mbyAobGlrZSkgYWxpZ25tZW50CgkoW1xzXFNdKj8pXHMqKD86KD86XHI/XG4pezJ9fFxaKSAgICAgICAgICAgICAgICAjIHN1YnRpdGxlX2NvbnRlbnQKCSIiIgoKcyA9ICJXRUJWVFRcblxuMDA6MDA6MDAuNDQwIC0tPiAwMDowMDowMi4zMjAgYWxpZ246bWlkZGxlIGxpbmU6LTFcbkhpLlxuXG4wMDowMDowMy40NDAgLS0+IDAwOjAwOjA3LjUyMCBhbGlnbjptaWRkbGUgbGluZTotMVxuVGhpcyBzdWJ0aXRsZSBoYXMgb25lIGxpbmUuXG5cbjAwOjAwOjA5LjI0MCAtLT4gMDA6MDA6MTEuMDgwIGFsaWduOm1pZGRsZSBsaW5lOi0yXG5UaGlzIHN1YnRpdGxlIGhhc1xudHdvIGxpbmVzLlxuXG4wMDowMDoxNS4yNDAgLS0+IDAwOjAwOjIzLjk2MCBhbGlnbjptaWRkbGUgbGluZTotNFxuTm93Li4uXG5MZXQncyB0cnlcbmZvdXIuLi5cbmxpbmVzLi4uXG5cbjAwOjAwOjI0LjA4MCAtLT4gMDA6MDA6MjcuMDgwIGFsaWduOm1pZGRsZSIKCm1hdGNoZXMgPSByZS5maW5kaXRlcihyeCwgcywgcmUuVkVSQk9TRSB8IHJlLk1VTFRJTElORSkKZm9yIG1hdGNoIGluIG1hdGNoZXM6CiAgICBzdWJ0aXRsZV9tYXRjaF9jb3VudCArPSAxCiAgICBncm91cDEsIGdyb3VwMiwgZ3JvdXAzLCBncm91cDQgPSBtYXRjaC5ncm91cHMoKQogICAgdGNfaW4gPSBncm91cDEuc3RyaXAoKQogICAgdGNfb3V0ID0gZ3JvdXAyLnN0cmlwKCkKICAgIHZ0dF9leHRyYV9pbmZvID0gZ3JvdXAzCiAgICBzdWJ0aXRsZV9jb250ZW50ID0gZ3JvdXA0CiAgICBwcmludCAiKioqIHN1YnRpdGxlIG1hdGNoIGNvdW50OiAlZCAqKioiICUgc3VidGl0bGVfbWF0Y2hfY291bnQKICAgIHByaW50ICJUSU1FQ09ERSBJTiIubGp1c3QoMjApLCB0Y19pbgogICAgcHJpbnQgIlRJTUVDT0RFIE9VVCIubGp1c3QoMjApLCB0Y19vdXQKICAgIHByaW50ICJBTElHTiIubGp1c3QoMjApLCB2dHRfZXh0cmFfaW5mby5zdHJpcCgpCiAgICBwcmludCAiU1VCVElUTEUgQ09OVEVOVCIubGp1c3QoMjApLCBzdWJ0aXRsZV9jb250ZW50CiAgICBwcmludA==