fork(1) download
  1. # Example: I am extracting car plate numbers that always follow the pattern [A-Z]{2}\d{5}.
  2. # Patterns might differ for other examples, but will always be some alphanumeric combination.
  3. # complex patterns may be ignored with some warning like "unable to parse"
  4.  
  5. import re
  6. def post_process(pattern, text, ambiguous_dict):
  7. # get text[0], check pattern
  8. # in this case, should be letter, if no, try to replace from dict, if yes, pass
  9. # continue with next letters until a match is found or looped the whole text
  10. matches = re.findall(pattern, text)
  11. if len(matches):
  12. return [x.translate(ambiguous_dict) for x in matches]
  13. else:
  14. return None
  15.  
  16. ambiguous_dict = {ord('2'): 'Z', ord('B'): '8'}
  17.  
  18. # My plate photo text: AZ45287
  19. # Noise is fairly easy to filter out by filtering on tesseract confidence level, although not ideal
  20. # so, if a function cannot be made that would find a match through the noise
  21. # the noise can be ignored in favor of a simpler fucntion that can just find a match
  22. ocr_output = "someNoise A2452B7 no1Ze"
  23.  
  24. # 2 is replaced by Z, B is replaced by 8. It would be acceptable if the function will
  25. # do this iteratively for each element of ocr_output until pattern is matched or return None
  26. # Any other functionally similar (recursive, generator, other) approach is also acceptable.
  27. result = post_process(r"[A-Z2]{2}[\dB]{5}", ocr_output, ambiguous_dict)
  28.  
  29. if result:
  30. print(result) # AZ45287
  31. else: # result is none
  32. print("failed to clean output")
  33.  
  34.  
Success #stdin #stdout 0.03s 9652KB
stdin
Standard input is empty
stdout
['AZ45Z87']