# Example: I am extracting car plate numbers that always follow the pattern [A-Z]{2}\d{5}.
# Patterns might differ for other examples, but will always be some alphanumeric combination.
# Complex patterns may be ignored with a warning like "unable to parse".
import re
def post_process(pattern, text, ambiguous_dict_1, ambiguous_dict_2):
    """Find every match of *pattern* in *text* and undo known OCR confusions.

    Each match is rebuilt from its first two capture groups: group 1 is
    translated with ``ambiguous_dict_1`` and group 2 with
    ``ambiguous_dict_2``, then the two parts are concatenated.

    Args:
        pattern: Regular expression with at least two capture groups. Its
            character classes must also admit the misread characters
            (e.g. ``[A-Z2]`` when ``2`` may stand in for ``Z``), otherwise
            the noisy text never matches in the first place.
        text: Raw OCR output to search; it may contain surrounding noise.
        ambiguous_dict_1: ``str.translate`` table (code point -> replacement)
            applied to group 1.
        ambiguous_dict_2: ``str.translate`` table (code point -> replacement)
            applied to group 2.

    Returns:
        A list with one corrected string per match, in order of appearance,
        or ``None`` when *text* is ``None`` or nothing matches.
    """
    if text is None:
        # Missing OCR output is reported the same way as "no match" instead
        # of letting re.finditer raise TypeError.
        return None

    corrected = [
        f"{m.group(1).translate(ambiguous_dict_1)}"
        f"{m.group(2).translate(ambiguous_dict_2)}"
        for m in re.finditer(pattern, text)
    ]
    # An empty result is normalised to None so "nothing found" has a single
    # representation.
    return corrected or None
# OCR look-alike corrections, written as str.translate tables
# (code point -> replacement character).
ambiguous_dict_1 = {ord('2'): 'Z'}  # For the first group: a "2" is read as "Z"
ambiguous_dict_2 = {ord('B'): '8'}  # For the second group: a "B" is read as "8"

# The text on my plate photo is AZ45287.
# Noise is fairly easy to filter out by filtering on the tesseract confidence
# level, although that is not ideal. So, if a function that finds a match
# through the noise cannot be written, the noise can be ignored in favor of a
# simpler function that just finds a match.
ocr_output = "someNoise A2452B7 no1Ze"

# 2 is replaced by Z and B is replaced by 8. It would be acceptable for the
# function to do this iteratively for each element of ocr_output until the
# pattern is matched, or to return None otherwise. Any other functionally
# similar (recursive, generator, other) approach is also acceptable.
result = post_process(r"([A-Z2]{2})([\dB]{5})", ocr_output, ambiguous_dict_1, ambiguous_dict_2)

if result:
    print(result)  # prints ['AZ45287'] (a list with one entry per match)
else:  # result is None
    print("failed to clean output")
IyBleGFtcGxlOiBJIGFtIGV4dHJhY3RpbmcgY2FyIHBsYXRlIG51bWJlcnMgdGhhdCBhbHdheXMgZm9sbG93IHBhdGVybiBbQS1aXXsyfVxkezV9CiMgcGF0dGVybnMgbWlnaHQgZGlmZmVyIGZvciBvdGhlciBleGFtcGxlLCBidXQgd2lsbCBhbHdheXMgYmUgc29tZSBhbGZhLW51bWVyaWMgY29tYmluYXRpb24KIyBjb21wbGV4IHBhdHRlcm5zIG1heSBiZSBpZ25vcmVkIHdpdGggc29tZSB3YXJuaW5nIGxpa2UgInVuYWJsZSB0byBwYXJzZSIgCgppbXBvcnQgcmUKZGVmIHBvc3RfcHJvY2VzcyhwYXR0ZXJuLCB0ZXh0LCBhbWJpZ3VvdXNfZGljdF8xLCBhbWJpZ3VvdXNfZGljdF8yKToKICAgICMgZ2V0IHRleHRbMF0sIGNoZWNrIHBhdHRlcm4KICAgICMgaW4gdGhpcyBjYXNlLCBzaG91bGQgYmUgbGV0dGVyLCBpZiBubywgdHJ5IHRvIHJlcGxhY2UgZnJvbSBkaWN0LCBpZiB5ZXMsIHBhc3MKICAgICMgY29udGludWUgd2l0aCBuZXh0IGxldHRlcnMgdW50aWwgYSBtYXRjaCBpcyBmb3VuZCBvciBsb29wZWQgdGhlIHdob2xlIHRleHQKICAgIG1hdGNoZXMgPSBsaXN0KHJlLmZpbmRpdGVyKHBhdHRlcm4sIHRleHQpKQogICAgaWYgbGVuKG1hdGNoZXMpOgogICAgICAgIHJldHVybiBbZiJ7eC5ncm91cCgxKS50cmFuc2xhdGUoYW1iaWd1b3VzX2RpY3RfMSl9e3guZ3JvdXAoMikudHJhbnNsYXRlKGFtYmlndW91c19kaWN0XzIpfSIgZm9yIHggaW4gbWF0Y2hlc10KICAgIGVsc2U6CiAgICAgICAgcmV0dXJuIE5vbmUKCmFtYmlndW91c19kaWN0XzEgPSB7b3JkKCcyJyk6ICdaJ30gIyBGb3IgdGhlIGZpcnN0IGdyb3VwCmFtYmlndW91c19kaWN0XzIgPSB7b3JkKCdCJyk6ICc4J30gIyBGb3IgdGhlIHNlY29uZCBncm91cAoKIyBNeSBwbGF0ZSBwaG90byB0ZXh0OiBBWjQ1Mjg3CiMgTm9pc2UgaXMgZmFpcmx5IGVhc3kgdG8gZmlsdGVyIG91dCBieSBmaWx0ZXJpbmcgb24gdGVzc2VyYWN0IGNvbmZpZGVuY2UgbGV2ZWwsIGFsdGhvdWdoIG5vdCBpZGVhbAojIHNvLCBpZiBhIGZ1bmN0aW9uIGNhbm5vdCBiZSBtYWRlIHRoYXQgd291bGQgZmluZCBhIG1hdGNoIHRocm91Z2ggdGhlIG5vaXNlCiMgdGhlIG5vaXNlIGNhbiBiZSBpZ25vcmVkIGluIGZhdm9yIG9mIGEgc2ltcGxlciBmdWNudGlvbiB0aGF0IGNhbiBqdXN0IGZpbmQgYSBtYXRjaCAgCm9jcl9vdXRwdXQgPSAic29tZU5vaXNlIEEyNDUyQjcgbm8xWmUiIAoKIyAyIGlzIHJlcGxhY2VkIGJ5IFosIEIgaXMgcmVwbGFjZWQgYnkgOC4gSXQgd291bGQgYmUgYWNjZXB0YWJsZSBpZiB0aGUgZnVuY3Rpb24gd2lsbAojIGRvIHRoaXMgaXRlcmF0aXZlbHkgZm9yIGVhY2ggZWxlbWVudCBvZiBvY3Jfb3V0cHV0IHVudGlsIHBhdHRlcm4gaXMgbWF0Y2hlZCBvciByZXR1cm4gTm9uZQojIEFueSBvdGhlciBmdW5jdGlvbmFsbHkgc2ltaWxhciAocmVjdXJzaXZlLCBnZW5lcmF0b3IsIG90aGVyKSBhcHByb2FjaCBpcyBhbHNvIGFjY2VwdGFibGUuIApyZXN1bHQgPSBwb3N0X3Byb2Nlc3MociIoW0Et
WjJdezJ9KShbXGRCXXs1fSkiLCBvY3Jfb3V0cHV0LCBhbWJpZ3VvdXNfZGljdF8xLCBhbWJpZ3VvdXNfZGljdF8yKQoKaWYgcmVzdWx0OgogICAgcHJpbnQocmVzdWx0KSAjIEFaNDUyODcKZWxzZTogIyByZXN1bHQgaXMgbm9uZQogICAgcHJpbnQoImZhaWxlZCB0byBjbGVhbiBvdXRwdXQiKQoK