fork download
  1. import re
  2. def clean_text(text):
  3. pattern = '([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
  4. text = re.sub(pattern=pattern, repl='', string=text)
  5. pattern = '(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
  6. text = re.sub(pattern=pattern, repl='', string=text)
  7. pattern = '([ㄱ-ㅎㅏ-ㅣ]+)'
  8. text = re.sub(pattern=pattern, repl='', string=text)
  9. pattern = '<[^>]*>'
  10. text = re.sub(pattern=pattern, repl='', string=text)
  11. pattern = '\s*\([^)]*\)'
  12. text = re.sub(pattern=pattern, repl='', string=text)
  13. pattern = '[^\w\s]'
  14. text = re.sub(pattern=pattern, repl='', string=text)
  15. return text.strip()
  16.  
  17. text = '(abc_def) 좋은글! (이것도 지워조) http://1...content-available-to-author-only...4.com 감사합니다. aaa@goggle.comㅋㅋ<H1>thank you</H1>'
  18. print(clean_text(text))
Success #stdin #stdout 0.01s 27800KB
stdin
Standard input is empty
stdout
좋은글  감사합니다 thank you