fork download
  1. # example: I am extracting car plate numbers that always follow the pattern [A-Z]{2}\d{5}
  2. # patterns might differ for other examples, but will always be some alphanumeric combination
  3. # complex patterns may be ignored with some warning like "unable to parse"
  4.  
  5. import re
  6. def post_process(pattern, text, ambiguous_dict_1, ambiguous_dict_2):
  7. # get text[0], check pattern
  8. # in this case, should be letter, if no, try to replace from dict, if yes, pass
  9. # continue with next letters until a match is found or looped the whole text
  10. matches = list(re.finditer(pattern, text))
  11. if len(matches):
  12. return [f"{x.group(1).translate(ambiguous_dict_1)}{x.group(2).translate(ambiguous_dict_2)}" for x in matches]
  13. else:
  14. return None
  15.  
  16. ambiguous_dict_1 = {ord('2'): 'Z'} # For the first group
  17. ambiguous_dict_2 = {ord('B'): '8'} # For the second group
  18.  
  19. # My plate photo text: AZ45287
  20. # Noise is fairly easy to filter out by filtering on tesseract confidence level, although not ideal
  21. # so, if a function cannot be made that would find a match through the noise
  22. # the noise can be ignored in favor of a simpler fucntion that can just find a match
  23. ocr_output = "someNoise A2452B7 no1Ze"
  24.  
  25. # 2 is replaced by Z, B is replaced by 8. It would be acceptable if the function will
  26. # do this iteratively for each element of ocr_output until pattern is matched or return None
  27. # Any other functionally similar (recursive, generator, other) approach is also acceptable.
  28. result = post_process(r"([A-Z2]{2})([\dB]{5})", ocr_output, ambiguous_dict_1, ambiguous_dict_2)
  29.  
  30. if result:
  31. print(result) # AZ45287
  32. else: # result is none
  33. print("failed to clean output")
  34.  
  35.  
Success #stdin #stdout 0.03s 9604KB
stdin
Standard input is empty
stdout
['AZ45287']