from lxml.html import fromstring
# Sample "tag soup" document used as parser input below. The markup is
# intentionally sloppy: the <p> element and several <li> elements have no
# closing tag (each such spot is flagged in the text itself with "brak /p"
# or "brak /li"), while the remaining elements are properly closed.
html = '''
<html>
<head>
<title>TEST</title>
</head>
<body>
<h1>Test tzw. <em>tag soup</em></h1>
<p>Przykładowy paragraf. (brak /p)
<ul>
<li>Pierwszy wpis(brak /li)
<li>Drugi <em>wpis</em></li>
<li>Wiecej wpisow 1</li>
<li>Wiecej wpisow 2</li>
<li>Wiecej wpisow 3(brak /li)
<li>Wiecej wpisow 4</li>
</ul>
<ol>
<li>AA(brak /li)
<li>BB <em>em</em></li>
<li>CC</li>
<li>DD</li>
<li>EE(brak /li)
<li>FF</li>
</ol>
</body>
</html>
'''
# Parse the sample markup with lxml's HTML parser; `h` is the element
# that fromstring() hands back for the document.
h = fromstring(html)

# Select every <li> that is a direct child of an <ol>. The query is run once
# and the resulting elements are reused by both listings below.
ordered_items = h.xpath('//ol/li')

# NOTE: print(...) with a single parenthesised argument prints the same text
# under the Python 2 print statement and the Python 3 print function.
print('Szukamy elemntow listy numerowanej')

# First listing: the text of each item as returned by text_content(),
# with whatever whitespace/newlines the parsed item contains.
print('Bez normalizacji:')
for e in ordered_items:
    print('[[' + e.text_content() + ']]')

# Second listing: the same items passed through the XPath normalize-space()
# function, evaluated with the <li> element as the context node.
print('Z normalizacja:')
for e in ordered_items:
    print('[[' + e.xpath('normalize-space()') + ']]')
ZnJvbSBseG1sLmh0bWwgaW1wb3J0IGZyb21zdHJpbmcKCmh0bWwgPSAnJycKPGh0bWw+Cgo8aGVhZD4KICA8dGl0bGU+VEVTVDwvdGl0bGU+CjwvaGVhZD4KCjxib2R5PgogIDxoMT5UZXN0IHR6dy4gPGVtPnRhZyBzb3VwPC9lbT48L2gxPgogIDxwPlByenlrxYJhZG93eSBwYXJhZ3JhZi4gKGJyYWsgL3ApCiAgPHVsPgogICAgPGxpPlBpZXJ3c3p5IHdwaXMoYnJhayAvbGkpCiAgICA8bGk+RHJ1Z2kgPGVtPndwaXM8L2VtPjwvbGk+CiAgICA8bGk+V2llY2VqIHdwaXNvdyAxPC9saT4KICAgIDxsaT5XaWVjZWogd3Bpc293IDI8L2xpPgogICAgPGxpPldpZWNlaiB3cGlzb3cgMyhicmFrIC9saSkKICAgIDxsaT5XaWVjZWogd3Bpc293IDQ8L2xpPgogIDwvdWw+CgogIDxvbD4KICAgIDxsaT5BQShicmFrIC9saSkKICAgIDxsaT5CQiA8ZW0+ZW08L2VtPjwvbGk+CiAgICA8bGk+Q0M8L2xpPgogICAgPGxpPkREPC9saT4KICAgIDxsaT5FRShicmFrIC9saSkKICAgIDxsaT5GRjwvbGk+CiAgPC9vbD4KPC9ib2R5PgoKPC9odG1sPgonJycKCmggPSBmcm9tc3RyaW5nKGh0bWwpCgpwcmludCAnU3p1a2FteSBlbGVtbnRvdyBsaXN0eSBudW1lcm93YW5laicKcHJpbnQgJ0JleiBub3JtYWxpemFjamk6Jwpmb3IgZSBpbiBoLnhwYXRoKCcvL29sL2xpJyk6CglwcmludCAnW1snICsgZS50ZXh0X2NvbnRlbnQoKSArICddXScKCQpwcmludCAnWiBub3JtYWxpemFjamE6Jwpmb3IgZSBpbiBoLnhwYXRoKCcvL29sL2xpJyk6CglwcmludCAnW1snICsgZS54cGF0aCgnbm9ybWFsaXplLXNwYWNlKCknKSArICddXSc=