"""Parse a fragmented text stream containing [references] and <tag>...</tag> spans.

The input arrives as arbitrary string fragments; markers may be split across
fragment boundaries, so parsing is done character by character.

Pass 1 (``split_tokens``) produces a flat list of ``{kind: content}`` tokens,
where kind is "text", "reference" or "tag".
Pass 2 (``fold_tags``) merges each tag with the text it encloses, producing
``{tag_name: text}`` entries.
"""
import json

textstream = [
    "This ",
    "is ",
    "a ",
    "reference ",
    "[",  # Notice that anything can be broken across different messages
    "r1] ",
    "and ",
    "here",
    "'s ",
    "an ",
    "image ",
    "of ",
    "a ",
    "cat <",  # A more extreme example, breaking messages completely arbitrarily
    "ima",
    "ge",
    ">a ",
    "black ",
    "ca",
    "t</i",
    "mage>.",
]
# Flatten the fragments into a single lazy stream of characters.
charstream = (char for fragment in textstream for char in fragment)

# Delimiter character -> parser state entered after seeing it.
_DELIMITER_STATE = {
    "[": "reference",
    "]": "text",
    "<": "tag",
    ">": "text",
}


def split_tokens(chars):
    """Split an iterable of characters into ``{state: content}`` tokens.

    The state starts as "text" and switches to "reference" on ``[`` and to
    "tag" on ``<``; ``]`` and ``>`` switch back to "text". Delimiters are
    never part of the content. A token is emitted whenever the state changes
    and some content has been collected; empty spans produce no token.

    Args:
        chars: iterable yielding single-character strings.

    Returns:
        List of one-key dicts, e.g. ``[{"text": "a"}, {"reference": "r1"}]``.
        Content left over at the end of input is emitted under whatever
        state is active, so an unterminated ``[`` or ``<`` yields a trailing
        "reference"/"tag" token.
    """
    state = "text"
    pending = []  # characters collected since the last emitted token
    tokens = []
    for char in chars:
        previous = state  # remembered so a state change can be detected
        if char in _DELIMITER_STATE:
            state = _DELIMITER_STATE[char]
        else:
            pending.append(char)
        if previous != state and pending:
            tokens.append({previous: "".join(pending)})
            pending = []
    if pending:
        # Could be treated as an error when state is not "text".
        tokens.append({state: "".join(pending)})
    return tokens


def fold_tags(tokens):
    """Attach enclosed text to its tag: ``<image>x</image>`` -> ``{"image": "x"}``.

    Tags are handled flat (no nesting stack): an opening tag becomes the
    current tag, and any tag whose name starts with "/" returns to top level.
    The closing name is not checked against the opening one.

    Args:
        tokens: list of one-key dicts as produced by ``split_tokens``.

    Returns:
        New list in which "tag" tokens are removed, "text" tokens inside a
        tag are rewritten to ``{tag_name: text}``, and every other token
        (top-level text, references anywhere) is passed through unchanged.
    """
    folded = []
    current_tag = ""  # empty string means "at top level"
    for token in tokens:
        if "tag" in token:
            name = token["tag"]
            # A stray closing tag must not be mistaken for an opening one.
            current_tag = "" if name.startswith("/") else name
            continue
        if current_tag and "text" in token:
            folded.append({current_tag: token["text"]})
        else:
            # Non-text tokens inside a tag are kept as-is instead of
            # failing on a missing "text" key.
            folded.append(token)
    return folded


parsed = split_tokens(charstream)  # list of parsed result
print(json.dumps(parsed, indent=2))

print("-----------------------------------")

result = fold_tags(parsed)
print(json.dumps(result, indent=2))
dGV4dHN0cmVhbT1bCiAgIlRoaXMgIiwKICAiaXMgIiwKICAiYSAiLAogICJyZWZlcmVuY2UgIiwKICAiWyIsICAjIE5vdGljZSB0aGF0IGFueXRoaW5nIGNhbiBiZSBicm9rZW4gYWNyb3NzIGRpZmZlcmVudCBtZXNzYWdlcwogICJyMV0gIiwKICAiYW5kICIsCiAgImhlcmUiLAogICIncyAiLAogICJhbiAiLAogICJpbWFnZSAiLAogICJvZiAiLAogICJhICIsCiAgImNhdCA8IiwgICMgQSBtb3JlIGV4dHJlbWUgZXhhbXBsZSwgYnJlYWtpbmcgbWVzc2FnZXMgY29tcGxldGVseSBhcmJpdHJhcnkKICAiaW1hIiwKICAiZ2UiLAogICI+YSAiLAogICJibGFjayAiLAogICJjYSIsCiAgInQ8L2kiLAogICJtYWdlPi4iLApdCmNoYXJzdHJlYW09KGNoYXIgZm9yIGZyYWdtZW50IGluIHRleHRzdHJlYW0gZm9yIGNoYXIgaW4gZnJhZ21lbnQpCgpzdGF0ZT0idGV4dCIgICMgc3RhdGUsIGNhbiBzd2l0Y2ggdG8gInJlZmVyZW5jZSIgYW5kICJ0YWciCmNvbGxlY3Rvcj0iIiAgIyB0ZW1wb3Jhcnkgc3RvcmFnZSBiZXR3ZWVuIHN0YXRlIGNoYW5nZXMKcGFyc2VkPVtdICAgICAjIGxpc3Qgb2YgcGFyc2VkIHJlc3VsdAoKZm9yIGNoYXJhY3RlciBpbiBjaGFyc3RyZWFtOgogIG9sZHN0YXRlPXN0YXRlICAgICAgICAgICAgICMgMi4gKHNvIHdlIHJlY29nbml6ZSBzdGF0ZSBjaGFuZ2VzKQogIGlmIGNoYXJhY3Rlcj09IlsiOiAgICAgICAgICMgMS4KICAgIHN0YXRlPSJyZWZlcmVuY2UiCiAgZWxpZiBjaGFyYWN0ZXI9PSJdIjoKICAgIHN0YXRlPSJ0ZXh0IgogIGVsaWYgY2hhcmFjdGVyPT0iPCI6CiAgICBzdGF0ZT0idGFnIgogIGVsaWYgY2hhcmFjdGVyPT0iPiI6CiAgICBzdGF0ZT0idGV4dCIKICBlbHNlOgogICAgY29sbGVjdG9yKz1jaGFyYWN0ZXIKICBpZiBvbGRzdGF0ZSE9c3RhdGUgYW5kIGNvbGxlY3RvcjogICAgICAgICMgMi4KICAgIHBhcnNlZC5hcHBlbmQoe29sZHN0YXRlOmNvbGxlY3Rvcn0pCiAgICBjb2xsZWN0b3I9IiIKCmlmIGNvbGxlY3RvcjogICAgICAgICAgICAgICAgIyAzLi1pc2gKICBwYXJzZWQuYXBwZW5kKHtzdGF0ZTpjb2xsZWN0b3J9KSAjIGNvdWxkIGJlIGFuIGVycm9yIGlmIG5vdCBzdGF0ZSBpcyBub3QgInRleHQiCgppbXBvcnQganNvbgpwcmludChqc29uLmR1bXBzKHBhcnNlZCxpbmRlbnQ9MikpCgpwcmludCgiLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0iKQoKcmVzdWx0PVtdCnRhZz0iIgpmb3IgZWxlbWVudCBpbiBwYXJzZWQ6CiAgaWYgInRhZyIgaW4gZWxlbWVudDoKICAgIGlmIHRhZz09IiI6ICAgICAgICAgICAjIGVudGVyaW5nIGEgdGFnCiAgICAgIHRhZz1lbGVtZW50WyJ0YWciXQogICAgZWxzZToKICAgICAgdGFnPSIiICAgICAgICAgICAgICAjIGl0J3MgYW4gZXhpdCwgY291bGQgYmUgdmFsaWRhdGVkCiAgZWxzZToKICAgIGlmIHRhZz09IiI6ICAgICAgICAgICAjIHdlIGFyZSBhdCB0b3AgbGV2ZWwKICAgICAgcmVzdWx0LmFwcGVuZChlbGVtZW50KQogICAgZWxzZTogICAgICAg
ICAgICAgICAgICMgd2UgdXNlIHRoZSB0YWcsIGFuZCBhbHNvIGV4cGVjdCAidGV4dCIKICAgICAgcmVzdWx0LmFwcGVuZCh7dGFnOmVsZW1lbnRbInRleHQiXX0pCgpwcmludChqc29uLmR1bXBzKHJlc3VsdCxpbmRlbnQ9Mikp
[
{
"text": "This is a reference "
},
{
"reference": "r1"
},
{
"text": " and here's an image of a cat "
},
{
"tag": "image"
},
{
"text": "a black cat"
},
{
"tag": "/image"
},
{
"text": "."
}
]
-----------------------------------
[
{
"text": "This is a reference "
},
{
"reference": "r1"
},
{
"text": " and here's an image of a cat "
},
{
"image": "a black cat"
},
{
"text": "."
}
]