# Example: I am extracting car plate numbers that always follow the pattern [A-Z]{2}\d{5}.
# Patterns might differ for other examples, but will always be some alphanumeric combination.
# Complex patterns may be ignored with a warning like "unable to parse".
import re
def _correct_ambiguous(regex, candidate, ambiguous_dict):
    """Swap ambiguous characters in ``candidate`` only where ``regex`` allows it."""
    chars = list(candidate)
    for index, char in enumerate(candidate):
        # Reuse str.translate on the single character so ordinal, string and
        # None table values behave exactly as in a whole-string translate().
        replacement = char.translate(ambiguous_dict)
        if replacement == char:
            continue
        trial = "".join(chars[:index] + [replacement] + chars[index + 1:])
        # Keep the swap only if the text is still a complete match; this is
        # what stops a genuine digit '2' from being turned into 'Z'.
        if regex.fullmatch(trial):
            chars[index] = replacement
    return "".join(chars)


def post_process(pattern, text, ambiguous_dict):
    """Find OCR candidates in ``text`` and fix ambiguous characters by position.

    Every occurrence of ``pattern`` in ``text`` is taken as a candidate. The
    pattern is expected to be the "loose" form that also accepts the
    look-alike characters (for example ``[A-Z2]{2}[\\dB]{5}``). Each candidate
    is scanned left to right; a character listed in ``ambiguous_dict`` is
    replaced only when the candidate still fully matches ``pattern`` after the
    replacement. A blanket ``str.translate`` would also rewrite characters
    that are already valid where they stand.

    Args:
        pattern: Regular expression (string or compiled pattern) describing a
            candidate, including the ambiguous look-alike characters.
        text: Raw OCR output, possibly containing noise around the candidate.
        ambiguous_dict: ``str.translate``-style table mapping the ordinal of a
            misread character to its intended replacement.

    Returns:
        A list with one corrected string per match, or ``None`` when
        ``pattern`` does not occur in ``text``.

    Note:
        The whole match (group 0) is corrected and returned, so capture groups
        in ``pattern`` do not change the result shape. Each candidate is
        re-checked on its own, without the surrounding text; a pattern that
        relies on context outside the match (such as a lookbehind) will
        therefore leave the candidate uncorrected.
    """
    regex = re.compile(pattern)
    corrected = [
        _correct_ambiguous(regex, found.group(0), ambiguous_dict)
        for found in regex.finditer(text)
    ]
    # Callers test the result for truthiness, so "no match" stays None.
    return corrected or None
# OCR look-alike table in str.translate form: a character read as '2' maps
# to 'Z' and a character read as 'B' maps to '8'.
ambiguous_dict = str.maketrans({"2": "Z", "B": "8"})
# My plate photo text: AZ45287
# Noise is fairly easy to filter out by filtering on the Tesseract confidence level, although that is not ideal.
# So, if a function cannot be made that would find a match through the noise,
# the noise can be ignored in favor of a simpler function that can just find a match.
# Sample OCR text: the plate candidate sits between two noise tokens.
ocr_output = "someNoise A2452B7 no1Ze"

# Intended clean-up: 2 is replaced by Z and B is replaced by 8. It would be
# acceptable for the function to do this iteratively for each element of
# ocr_output until the pattern is matched, or to return None otherwise.
# Any other functionally similar approach (recursive, generator, other) is
# also acceptable.
result = post_process(r"[A-Z2]{2}[\dB]{5}", ocr_output, ambiguous_dict)

# Print the list of cleaned matches (intended plate text: AZ45287), or the
# failure message when result is None.
print(result or "failed to clean output")
IyBleGFtcGxlOiBJIGFtIGV4dHJhY3RpbmcgY2FyIHBsYXRlIG51bWJlcnMgdGhhdCBhbHdheXMgZm9sbG93IHBhdGVybiBbQS1aXXsyfVxkezV9CiMgcGF0dGVybnMgbWlnaHQgZGlmZmVyIGZvciBvdGhlciBleGFtcGxlLCBidXQgd2lsbCBhbHdheXMgYmUgc29tZSBhbGZhLW51bWVyaWMgY29tYmluYXRpb24KIyBjb21wbGV4IHBhdHRlcm5zIG1heSBiZSBpZ25vcmVkIHdpdGggc29tZSB3YXJuaW5nIGxpa2UgInVuYWJsZSB0byBwYXJzZSIgCgppbXBvcnQgcmUKZGVmIHBvc3RfcHJvY2VzcyhwYXR0ZXJuLCB0ZXh0LCBhbWJpZ3VvdXNfZGljdCk6CiAgICAjIGdldCB0ZXh0WzBdLCBjaGVjayBwYXR0ZXJuCiAgICAjIGluIHRoaXMgY2FzZSwgc2hvdWxkIGJlIGxldHRlciwgaWYgbm8sIHRyeSB0byByZXBsYWNlIGZyb20gZGljdCwgaWYgeWVzLCBwYXNzCiAgICAjIGNvbnRpbnVlIHdpdGggbmV4dCBsZXR0ZXJzIHVudGlsIGEgbWF0Y2ggaXMgZm91bmQgb3IgbG9vcGVkIHRoZSB3aG9sZSB0ZXh0CiAgICBtYXRjaGVzID0gcmUuZmluZGFsbChwYXR0ZXJuLCB0ZXh0KQogICAgaWYgbGVuKG1hdGNoZXMpOgogICAgICAgIHJldHVybiBbeC50cmFuc2xhdGUoYW1iaWd1b3VzX2RpY3QpIGZvciB4IGluIG1hdGNoZXNdCiAgICBlbHNlOgogICAgICAgIHJldHVybiBOb25lCgphbWJpZ3VvdXNfZGljdCA9IHtvcmQoJzInKTogJ1onLCBvcmQoJ0InKTogJzgnfQoKIyBNeSBwbGF0ZSBwaG90byB0ZXh0OiBBWjQ1Mjg3CiMgTm9pc2UgaXMgZmFpcmx5IGVhc3kgdG8gZmlsdGVyIG91dCBieSBmaWx0ZXJpbmcgb24gdGVzc2VyYWN0IGNvbmZpZGVuY2UgbGV2ZWwsIGFsdGhvdWdoIG5vdCBpZGVhbAojIHNvLCBpZiBhIGZ1bmN0aW9uIGNhbm5vdCBiZSBtYWRlIHRoYXQgd291bGQgZmluZCBhIG1hdGNoIHRocm91Z2ggdGhlIG5vaXNlCiMgdGhlIG5vaXNlIGNhbiBiZSBpZ25vcmVkIGluIGZhdm9yIG9mIGEgc2ltcGxlciBmdWNudGlvbiB0aGF0IGNhbiBqdXN0IGZpbmQgYSBtYXRjaCAgCm9jcl9vdXRwdXQgPSAic29tZU5vaXNlIEEyNDUyQjcgbm8xWmUiIAoKIyAyIGlzIHJlcGxhY2VkIGJ5IFosIEIgaXMgcmVwbGFjZWQgYnkgOC4gSXQgd291bGQgYmUgYWNjZXB0YWJsZSBpZiB0aGUgZnVuY3Rpb24gd2lsbAojIGRvIHRoaXMgaXRlcmF0aXZlbHkgZm9yIGVhY2ggZWxlbWVudCBvZiBvY3Jfb3V0cHV0IHVudGlsIHBhdHRlcm4gaXMgbWF0Y2hlZCBvciByZXR1cm4gTm9uZQojIEFueSBvdGhlciBmdW5jdGlvbmFsbHkgc2ltaWxhciAocmVjdXJzaXZlLCBnZW5lcmF0b3IsIG90aGVyKSBhcHByb2FjaCBpcyBhbHNvIGFjY2VwdGFibGUuIApyZXN1bHQgPSBwb3N0X3Byb2Nlc3MociJbQS1aMl17Mn1bXGRCXXs1fSIsIG9jcl9vdXRwdXQsIGFtYmlndW91c19kaWN0KQoKaWYgcmVzdWx0OgogICAgcHJpbnQocmVzdWx0KSAjIEFaNDUyODcKZWxzZTogIyByZXN1bHQgaXMgbm9uZQogICAgcHJpbnQoImZhaWxlZCB0byBjbGVhbiBvdXRwdXQiKQoK