You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

html2text.py 15 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
#!/usr/bin/env python
"""html2text: Turn HTML into equivalent Markdown-structured text."""
# Module metadata.
__version__ = "3.02"
__author__ = "Aaron Swartz (me@aaronsw.com)"
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]

# TODO:
#   Support decoded entities with unifiable.

# Compatibility shim: if the interpreter has no built-in name ``True``,
# install integer stand-ins for True and False on __builtins__.
try:
    True
except NameError:
    setattr(__builtins__, 'True', 1)
    setattr(__builtins__, 'False', 0)
  14. def has_key(x, y):
  15. if hasattr(x, 'has_key'): return x.has_key(y)
  16. else: return y in x
  17. try:
  18. import htmlentitydefs
  19. import urlparse
  20. import HTMLParser
  21. except ImportError: #Python3
  22. import html.entities as htmlentitydefs
  23. import urllib.parse as urlparse
  24. import html.parser as HTMLParser
  25. try: #Python3
  26. import urllib.request as urllib
  27. except:
  28. import urllib
  29. import optparse, re, sys, codecs, types
  30. try: from textwrap import wrap
  31. except: pass
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0

# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 78

# Don't show internal links (href="#local-anchor") -- corresponding link targets
# won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = False
  41. ### Entity Nonsense ###
  42. def name2cp(k):
  43. if k == 'apos': return ord("'")
  44. if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
  45. return htmlentitydefs.name2codepoint[k]
  46. else:
  47. k = htmlentitydefs.entitydefs[k]
  48. if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
  49. return ord(codecs.latin_1_decode(k)[0])
  50. unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
  51. 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
  52. 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
  53. 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
  54. 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
  55. 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
  56. 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
  57. 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
  58. unifiable_n = {}
  59. for k in unifiable.keys():
  60. unifiable_n[name2cp(k)] = unifiable[k]
  61. def charref(name):
  62. if name[0] in ['x','X']:
  63. c = int(name[1:], 16)
  64. else:
  65. c = int(name)
  66. if not UNICODE_SNOB and c in unifiable_n.keys():
  67. return unifiable_n[c]
  68. else:
  69. try:
  70. return unichr(c)
  71. except NameError: #Python3
  72. return chr(c)
  73. def entityref(c):
  74. if not UNICODE_SNOB and c in unifiable.keys():
  75. return unifiable[c]
  76. else:
  77. try: name2cp(c)
  78. except KeyError: return "&" + c + ';'
  79. else:
  80. try:
  81. return unichr(name2cp(c))
  82. except NameError: #Python3
  83. return chr(name2cp(c))
  84. def replaceEntities(s):
  85. s = s.group(1)
  86. if s[0] == "#":
  87. return charref(s[1:])
  88. else: return entityref(s)
# Matches "&name;", "&#123;" and "&#x1F;" style references; group 1 is the
# text between "&" and ";".
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(s):
    """Return ``s`` with each reference matched by r_unescape replaced by
    the result of replaceEntities()."""
    return r_unescape.sub(replaceEntities, s)
  92. ### End Entity Nonsense ###
  93. def onlywhite(line):
  94. """Return true if the line does only consist of whitespace characters."""
  95. for c in line:
  96. if c is not ' ' and c is not ' ':
  97. return c is ' '
  98. return line
  99. def optwrap(text):
  100. """Wrap all paragraphs in the provided text."""
  101. if not BODY_WIDTH:
  102. return text
  103. assert wrap, "Requires Python 2.3."
  104. result = ''
  105. newlines = 0
  106. for para in text.split("\n"):
  107. if len(para) > 0:
  108. if para[0] != ' ' and para[0] != '-' and para[0] != '*':
  109. for line in wrap(para, BODY_WIDTH):
  110. result += line + "\n"
  111. result += "\n"
  112. newlines = 2
  113. else:
  114. if not onlywhite(para):
  115. result += para + "\n"
  116. newlines = 1
  117. else:
  118. if newlines < 2:
  119. result += "\n"
  120. newlines += 1
  121. return result
  122. def hn(tag):
  123. if tag[0] == 'h' and len(tag) == 2:
  124. try:
  125. n = int(tag[1])
  126. if n in range(1, 10): return n
  127. except ValueError: return 0
  128. class _html2text(HTMLParser.HTMLParser):
  129. def __init__(self, out=None, baseurl=''):
  130. HTMLParser.HTMLParser.__init__(self)
  131. if out is None: self.out = self.outtextf
  132. else: self.out = out
  133. try:
  134. self.outtext = unicode()
  135. except NameError: # Python3
  136. self.outtext = str()
  137. self.quiet = 0
  138. self.p_p = 0
  139. self.outcount = 0
  140. self.start = 1
  141. self.space = 0
  142. self.a = []
  143. self.astack = []
  144. self.acount = 0
  145. self.list = []
  146. self.blockquote = 0
  147. self.pre = 0
  148. self.startpre = 0
  149. self.lastWasNL = 0
  150. self.abbr_title = None # current abbreviation definition
  151. self.abbr_data = None # last inner HTML (for abbr being defined)
  152. self.abbr_list = {} # stack of abbreviations to write later
  153. self.baseurl = baseurl
  154. def outtextf(self, s):
  155. self.outtext += s
  156. def close(self):
  157. HTMLParser.HTMLParser.close(self)
  158. self.pbr()
  159. self.o('', 0, 'end')
  160. return self.outtext
  161. def handle_charref(self, c):
  162. self.o(charref(c))
  163. def handle_entityref(self, c):
  164. self.o(entityref(c))
  165. def handle_starttag(self, tag, attrs):
  166. self.handle_tag(tag, attrs, 1)
  167. def handle_endtag(self, tag):
  168. self.handle_tag(tag, None, 0)
  169. def previousIndex(self, attrs):
  170. """ returns the index of certain set of attributes (of a link) in the
  171. self.a list
  172. If the set of attributes is not found, returns None
  173. """
  174. if not has_key(attrs, 'href'): return None
  175. i = -1
  176. for a in self.a:
  177. i += 1
  178. match = 0
  179. if has_key(a, 'href') and a['href'] == attrs['href']:
  180. if has_key(a, 'title') or has_key(attrs, 'title'):
  181. if (has_key(a, 'title') and has_key(attrs, 'title') and
  182. a['title'] == attrs['title']):
  183. match = True
  184. else:
  185. match = True
  186. if match: return i
  187. def handle_tag(self, tag, attrs, start):
  188. #attrs = fixattrs(attrs)
  189. if hn(tag):
  190. self.p()
  191. if start: self.o(hn(tag)*"#" + ' ')
  192. if tag in ['p', 'div']: self.p()
  193. if tag == "br" and start: self.o(" \n")
  194. if tag == "hr" and start:
  195. self.p()
  196. self.o("* * *")
  197. self.p()
  198. if tag in ["head", "style", 'script']:
  199. if start: self.quiet += 1
  200. else: self.quiet -= 1
  201. if tag in ["body"]:
  202. self.quiet = 0 # sites like 9rules.com never close <head>
  203. if tag == "blockquote":
  204. if start:
  205. self.p(); self.o('> ', 0, 1); self.start = 1
  206. self.blockquote += 1
  207. else:
  208. self.blockquote -= 1
  209. self.p()
  210. if tag in ['em', 'i', 'u']: self.o("_")
  211. if tag in ['strong', 'b']: self.o("**")
  212. if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
  213. if tag == "abbr":
  214. if start:
  215. attrsD = {}
  216. for (x, y) in attrs: attrsD[x] = y
  217. attrs = attrsD
  218. self.abbr_title = None
  219. self.abbr_data = ''
  220. if has_key(attrs, 'title'):
  221. self.abbr_title = attrs['title']
  222. else:
  223. if self.abbr_title != None:
  224. self.abbr_list[self.abbr_data] = self.abbr_title
  225. self.abbr_title = None
  226. self.abbr_data = ''
  227. if tag == "a":
  228. if start:
  229. attrsD = {}
  230. for (x, y) in attrs: attrsD[x] = y
  231. attrs = attrsD
  232. if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
  233. self.astack.append(attrs)
  234. self.o("[")
  235. else:
  236. self.astack.append(None)
  237. else:
  238. if self.astack:
  239. a = self.astack.pop()
  240. if a:
  241. i = self.previousIndex(a)
  242. if i is not None:
  243. a = self.a[i]
  244. else:
  245. self.acount += 1
  246. a['count'] = self.acount
  247. a['outcount'] = self.outcount
  248. self.a.append(a)
  249. self.o("][" + str(a['count']) + "]")
  250. if tag == "img" and start:
  251. attrsD = {}
  252. for (x, y) in attrs: attrsD[x] = y
  253. attrs = attrsD
  254. if has_key(attrs, 'src'):
  255. attrs['href'] = attrs['src']
  256. alt = attrs.get('alt', '')
  257. i = self.previousIndex(attrs)
  258. if i is not None:
  259. attrs = self.a[i]
  260. else:
  261. self.acount += 1
  262. attrs['count'] = self.acount
  263. attrs['outcount'] = self.outcount
  264. self.a.append(attrs)
  265. self.o("![")
  266. self.o(alt)
  267. self.o("]["+ str(attrs['count']) +"]")
  268. if tag == 'dl' and start: self.p()
  269. if tag == 'dt' and not start: self.pbr()
  270. if tag == 'dd' and start: self.o(' ')
  271. if tag == 'dd' and not start: self.pbr()
  272. if tag in ["ol", "ul"]:
  273. if start:
  274. self.list.append({'name':tag, 'num':0})
  275. else:
  276. if self.list: self.list.pop()
  277. self.p()
  278. if tag == 'li':
  279. if start:
  280. self.pbr()
  281. if self.list: li = self.list[-1]
  282. else: li = {'name':'ul', 'num':0}
  283. self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
  284. if li['name'] == "ul": self.o("* ")
  285. elif li['name'] == "ol":
  286. li['num'] += 1
  287. self.o(str(li['num'])+". ")
  288. self.start = 1
  289. else:
  290. self.pbr()
  291. if tag in ["table", "tr"] and start: self.p()
  292. if tag == 'td': self.pbr()
  293. if tag == "pre":
  294. if start:
  295. self.startpre = 1
  296. self.pre = 1
  297. else:
  298. self.pre = 0
  299. self.p()
  300. def pbr(self):
  301. if self.p_p == 0: self.p_p = 1
  302. def p(self): self.p_p = 2
  303. def o(self, data, puredata=0, force=0):
  304. if self.abbr_data is not None: self.abbr_data += data
  305. if not self.quiet:
  306. if puredata and not self.pre:
  307. data = re.sub('\s+', ' ', data)
  308. if data and data[0] == ' ':
  309. self.space = 1
  310. data = data[1:]
  311. if not data and not force: return
  312. if self.startpre:
  313. #self.out(" :") #TODO: not output when already one there
  314. self.startpre = 0
  315. bq = (">" * self.blockquote)
  316. if not (force and data and data[0] == ">") and self.blockquote: bq += " "
  317. if self.pre:
  318. bq += " "
  319. data = data.replace("\n", "\n"+bq)
  320. if self.start:
  321. self.space = 0
  322. self.p_p = 0
  323. self.start = 0
  324. if force == 'end':
  325. # It's the end.
  326. self.p_p = 0
  327. self.out("\n")
  328. self.space = 0
  329. if self.p_p:
  330. self.out(('\n'+bq)*self.p_p)
  331. self.space = 0
  332. if self.space:
  333. if not self.lastWasNL: self.out(' ')
  334. self.space = 0
  335. if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
  336. if force == "end": self.out("\n")
  337. newa = []
  338. for link in self.a:
  339. if self.outcount > link['outcount']:
  340. self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
  341. if has_key(link, 'title'): self.out(" ("+link['title']+")")
  342. self.out("\n")
  343. else:
  344. newa.append(link)
  345. if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
  346. self.a = newa
  347. if self.abbr_list and force == "end":
  348. for abbr, definition in self.abbr_list.items():
  349. self.out(" *[" + abbr + "]: " + definition + "\n")
  350. self.p_p = 0
  351. self.out(data)
  352. self.lastWasNL = data and data[-1] == '\n'
  353. self.outcount += 1
  354. def handle_data(self, data):
  355. if r'\/script>' in data: self.quiet -= 1
  356. self.o(data, 1)
  357. def unknown_decl(self, data): pass
  358. def wrapwrite(text):
  359. text = text.encode('utf-8')
  360. try: #Python3
  361. sys.stdout.buffer.write(text)
  362. except AttributeError:
  363. sys.stdout.write(text)
  364. def html2text_file(html, out=wrapwrite, baseurl=''):
  365. h = _html2text(out, baseurl)
  366. h.feed(html)
  367. h.feed("")
  368. return h.close()
  369. def html2text(html, baseurl=''):
  370. txt = html2text_file(html.decode('utf-8'), None, baseurl)
  371. return optwrap(txt.encode('utf-8'))
  372. if __name__ == "__main__":
  373. baseurl = ''
  374. p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
  375. version='%prog ' + __version__)
  376. args = p.parse_args()[1]
  377. if len(args) > 0:
  378. file_ = args[0]
  379. encoding = None
  380. if len(args) == 2:
  381. encoding = args[1]
  382. if len(args) > 2:
  383. p.error('Too many arguments')
  384. if file_.startswith('http://') or file_.startswith('https://'):
  385. baseurl = file_
  386. j = urllib.urlopen(baseurl)
  387. text = j.read()
  388. if encoding is None:
  389. try:
  390. from feedparser import _getCharacterEncoding as enc
  391. except ImportError:
  392. enc = lambda x, y: ('utf-8', 1)
  393. encoding = enc(j.headers, text)[0]
  394. if encoding == 'us-ascii':
  395. encoding = 'utf-8'
  396. data = text.decode(encoding)
  397. else:
  398. data = open(file_, 'rb').read()
  399. if encoding is None:
  400. try:
  401. from chardet import detect
  402. except ImportError:
  403. detect = lambda x: {'encoding': 'utf-8'}
  404. encoding = detect(data)['encoding']
  405. data = data.decode(encoding)
  406. else:
  407. data = sys.stdin.read()
  408. wrapwrite(html2text(data, baseurl))