From d8caa061d17df3a6eafe96f37b45f5da65f299ee Mon Sep 17 00:00:00 2001
From: Rushabh Mehta
Date: Mon, 24 Feb 2014 18:57:13 +0530
Subject: [PATCH] sync statics spooks

---
 frappe/utils/backups.py                     |   2 +-
 frappe/utils/email_lib/html2text.py         | 806 +++++++++++++++-----
 frappe/website/doctype/web_page/web_page.py |  64 +-
 3 files changed, 655 insertions(+), 217 deletions(-)
 mode change 100644 => 100755 frappe/utils/email_lib/html2text.py

diff --git a/frappe/utils/backups.py b/frappe/utils/backups.py
index fa0173e3a9..8d28ca29b1 100644
--- a/frappe/utils/backups.py
+++ b/frappe/utils/backups.py
@@ -82,7 +82,7 @@ class BackupGenerator:
 		return (backup_path_db, backup_path_files)
 
 	def zip_files(self):
-		files_path = frappe.utils.get_site_path(conf.files_path)
+		files_path = frappe.get_site_path("public", "files")
 		cmd_string = """tar -cf %s %s""" % (self.backup_path_files, files_path)
 		err, out = frappe.utils.execute_in_shell(cmd_string)
 
diff --git a/frappe/utils/email_lib/html2text.py b/frappe/utils/email_lib/html2text.py
old mode 100644
new mode 100755
index b7f8062f75..d21a0b1a21
--- a/frappe/utils/email_lib/html2text.py
+++ b/frappe/utils/email_lib/html2text.py
@@ -1,10 +1,6 @@
-# Copyright (c) 2013, Web Notes Technologies Pvt. Ltd. and Contributors
-# MIT License. See license.txt
-
 #!/usr/bin/env python
-from __future__ import unicode_literals
 """html2text: Turn HTML into equivalent Markdown-structured text."""
-__version__ = "3.02"
+__version__ = "3.200.3"
 __author__ = "Aaron Swartz (me@aaronsw.com)"
 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
@@ -42,6 +38,9 @@ except: pass
 # Use Unicode characters instead of their ascii psuedo-replacements
 UNICODE_SNOB = 0
 
+# Escape all special characters. Output is less readable, but avoids corner case formatting issues.
+ESCAPE_SNOB = 0
+
 # Put the links after each paragraph instead of at the end.
 LINKS_EACH_PARAGRAPH = 0
 
@@ -50,7 +49,17 @@ BODY_WIDTH = 78
 
 # Don't show internal links (href="#local-anchor") -- corresponding link targets
 # won't be visible in the plain text file anyway.
-SKIP_INTERNAL_LINKS = False
+SKIP_INTERNAL_LINKS = True
+
+# Use inline, rather than reference, formatting for images and links
+INLINE_LINKS = True
+
+# Number of pixels Google indents nested lists
+GOOGLE_LIST_INDENT = 36
+
+IGNORE_ANCHORS = False
+IGNORE_IMAGES = False
+IGNORE_EMPHASIS = False
 
 ### Entity Nonsense ###
 
@@ -63,56 +72,21 @@ def name2cp(k):
         if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
         return ord(codecs.latin_1_decode(k)[0])
 
-unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 
+unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
-'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 
-'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 
+'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
+'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
-'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 
-'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
+'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
+'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
+'lrm':'', 'rlm':''}
 
 unifiable_n = {}
 
 for k in unifiable.keys():
     unifiable_n[name2cp(k)] = unifiable[k]
 
-def charref(name):
-    if name[0] in ['x','X']:
-        c = int(name[1:], 16)
-    else:
-        c = int(name)
-
-    if not UNICODE_SNOB and c in unifiable_n.keys():
-        return unifiable_n[c]
-    else:
-        try:
-            return unichr(c)
-        except NameError: #Python3
-            return chr(c)
-
-def entityref(c):
-    if not UNICODE_SNOB and c in unifiable.keys():
-        return unifiable[c]
-    else:
-        try: name2cp(c)
-        except KeyError: return "&" + c + ';'
-        else:
-            try:
-                return unichr(name2cp(c))
-            except NameError: #Python3
-                return chr(name2cp(c))
-
-def replaceEntities(s):
-    s = s.group(1)
-    if s[0] == "#":
-        return charref(s[1:])
-    else: return entityref(s)
-
-r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
-def unescape(s):
-    return r_unescape.sub(replaceEntities, s)
-
 ### End Entity Nonsense ###
 
 def onlywhite(line):
@@ -122,31 +96,6 @@ def onlywhite(line):
             return c is ' '
     return line
 
-def optwrap(text):
-    """Wrap all paragraphs in the provided text."""
-    if not BODY_WIDTH:
-        return text
-
-    assert wrap, "Requires Python 2.3."
-    result = ''
-    newlines = 0
-    for para in text.split("\n"):
-        if len(para) > 0:
-            if para[0] != ' ' and para[0] != '-' and para[0] != '*':
-                for line in wrap(para, BODY_WIDTH):
-                    result += line + "\n"
-                result += "\n"
-                newlines = 2
-            else:
-                if not onlywhite(para):
-                    result += para + "\n"
-                    newlines = 1
-        else:
-            if newlines < 2:
-                result += "\n"
-                newlines += 1
-    return result
-
 def hn(tag):
     if tag[0] == 'h' and len(tag) == 2:
         try:
@@ -154,70 +103,200 @@
             n = int(tag[1])
             if n in range(1, 10): return n
         except ValueError: return 0
 
-class _html2text(HTMLParser.HTMLParser):
+def dumb_property_dict(style):
+    """returns a hash of css attributes"""
+    return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
+
+def dumb_css_parser(data):
+    """returns a hash of css selectors, each of which contains a hash of css attributes"""
+    # remove @import sentences
+    data += ';'
+    importIndex = data.find('@import')
+    while importIndex != -1:
+        data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
+        importIndex = data.find('@import')
+
+    # parse the css. reverted from dictionary comprehension in order to support older pythons
+    elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
+    try:
+        elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
+    except ValueError:
+        elements = {} # not that important
+
+    return elements
+
+def element_style(attrs, style_def, parent_style):
+    """returns a hash of the 'final' style attributes of the element"""
+    style = parent_style.copy()
+    if 'class' in attrs:
+        for css_class in attrs['class'].split():
+            css_style = style_def['.' + css_class]
+            style.update(css_style)
+    if 'style' in attrs:
+        immediate_style = dumb_property_dict(attrs['style'])
+        style.update(immediate_style)
+    return style
+
+def google_list_style(style):
+    """finds out whether this is an ordered or unordered list"""
+    if 'list-style-type' in style:
+        list_style = style['list-style-type']
+        if list_style in ['disc', 'circle', 'square', 'none']:
+            return 'ul'
+    return 'ol'
+
+def google_has_height(style):
+    """check if the style of the element has the 'height' attribute explicitly defined"""
+    if 'height' in style:
+        return True
+    return False
+
+def google_text_emphasis(style):
+    """return a list of all emphasis modifiers of the element"""
+    emphasis = []
+    if 'text-decoration' in style:
+        emphasis.append(style['text-decoration'])
+    if 'font-style' in style:
+        emphasis.append(style['font-style'])
+    if 'font-weight' in style:
+        emphasis.append(style['font-weight'])
+    return emphasis
+
+def google_fixed_width_font(style):
+    """check if the css of the current element defines a fixed width font"""
+    font_family = ''
+    if 'font-family' in style:
+        font_family = style['font-family']
+    if 'Courier New' == font_family or 'Consolas' == font_family:
+        return True
+    return False
+
+def list_numbering_start(attrs):
+    """extract numbering from list element attributes"""
+    if 'start' in attrs:
+        return int(attrs['start']) - 1
+    else:
+        return 0
+
+class HTML2Text(HTMLParser.HTMLParser):
     def __init__(self, out=None, baseurl=''):
         HTMLParser.HTMLParser.__init__(self)
-
-        if out is None: self.out = self.outtextf
-        else: self.out = out
+
+        # Config options
+        self.unicode_snob = UNICODE_SNOB
+        self.escape_snob = ESCAPE_SNOB
+        self.links_each_paragraph = LINKS_EACH_PARAGRAPH
+        self.body_width = BODY_WIDTH
+        self.skip_internal_links = SKIP_INTERNAL_LINKS
+        self.inline_links = INLINE_LINKS
+        self.google_list_indent = GOOGLE_LIST_INDENT
+        self.ignore_links = IGNORE_ANCHORS
+        self.ignore_images = IGNORE_IMAGES
+        self.ignore_emphasis = IGNORE_EMPHASIS
+        self.google_doc = False
+        self.ul_item_mark = '*'
+        self.emphasis_mark = '_'
+        self.strong_mark = '**'
+
+        if out is None:
+            self.out = self.outtextf
+        else:
+            self.out = out
+
+        self.outtextlist = [] # empty list to store output characters before they are "joined"
+
         try: self.outtext = unicode()
-        except NameError: # Python3
+        except NameError: # Python3
             self.outtext = str()
+
         self.quiet = 0
-        self.p_p = 0
+        self.p_p = 0 # number of newline character to print before next output
         self.outcount = 0
         self.start = 1
         self.space = 0
         self.a = []
         self.astack = []
+        self.maybe_automatic_link = None
+        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
         self.acount = 0
         self.list = []
         self.blockquote = 0
         self.pre = 0
         self.startpre = 0
+        self.code = False
+        self.br_toggle = ''
         self.lastWasNL = 0
-        self.abbr_title = None # current abbreviation definition
-        self.abbr_data = None # last inner HTML (for abbr being defined)
-        self.abbr_list = {} # stack of abbreviations to write later
+        self.lastWasList = False
+        self.style = 0
+        self.style_def = {}
+        self.tag_stack = []
+        self.emphasis = 0
+        self.drop_white_space = 0
+        self.inheader = False
+        self.abbr_title = None # current abbreviation definition
+        self.abbr_data = None # last inner HTML (for abbr being defined)
+        self.abbr_list = {} # stack of abbreviations to write later
         self.baseurl = baseurl
-
-    def outtextf(self, s):
-        self.outtext += s
-
+
+        try: del unifiable_n[name2cp('nbsp')]
+        except KeyError: pass
+        unifiable['nbsp'] = '&nbsp_place_holder;'
+
+
+    def feed(self, data):
+        data = data.replace("</' + 'script>", "</ignore_script>")
+        HTMLParser.HTMLParser.feed(self, data)
+
+    def handle(self, data):
+        self.feed(data)
+        self.feed("")
+        return self.optwrap(self.close())
+
+    def outtextf(self, s):
+        self.outtextlist.append(s)
+        if s: self.lastWasNL = s[-1] == '\n'
+
     def close(self):
         HTMLParser.HTMLParser.close(self)
-
+
         self.pbr()
         self.o('', 0, 'end')
-
+
+        self.outtext = self.outtext.join(self.outtextlist)
+        if self.unicode_snob:
+            nbsp = unichr(name2cp('nbsp'))
+        else:
+            nbsp = u' '
+        self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
+
         return self.outtext
-
+
     def handle_charref(self, c):
-        self.o(charref(c))
+        self.o(self.charref(c), 1)
 
     def handle_entityref(self, c):
-        self.o(entityref(c))
-
+        self.o(self.entityref(c), 1)
+
     def handle_starttag(self, tag, attrs):
         self.handle_tag(tag, attrs, 1)
-
+
     def handle_endtag(self, tag):
         self.handle_tag(tag, None, 0)
-
+
     def previousIndex(self, attrs):
         """ returns the index of certain set of attributes (of a link) in the
             self.a list
-
+
             If the set of attributes is not found, returns None
         """
         if not has_key(attrs, 'href'): return None
-
+
         i = -1
         for a in self.a:
             i += 1
             match = 0
-
+
             if has_key(a, 'href') and a['href'] == attrs['href']:
                 if has_key(a, 'title') or has_key(attrs, 'title'):
                         if (has_key(a, 'title') and has_key(attrs, 'title') and
@@ -228,15 +307,114 @@ class _html2text(HTMLParser.HTMLParser):
 
         if match: return i
 
+    def drop_last(self, nLetters):
+        if not self.quiet:
+            self.outtext = self.outtext[:-nLetters]
+
+    def handle_emphasis(self, start, tag_style, parent_style):
+        """handles various text emphases"""
+        tag_emphasis = google_text_emphasis(tag_style)
+        parent_emphasis = google_text_emphasis(parent_style)
+
+        # handle Google's text emphasis
+        strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
+        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
+        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
+        fixed = google_fixed_width_font(tag_style) and not \
+            google_fixed_width_font(parent_style) and not self.pre
+
+        if start:
+            # crossed-out text must be handled before other attributes
+            # in order not to output qualifiers unnecessarily
+            if bold or italic or fixed:
+                self.emphasis += 1
+            if strikethrough:
+                self.quiet += 1
+            if italic:
+                self.o(self.emphasis_mark)
+                self.drop_white_space += 1
+            if bold:
+                self.o(self.strong_mark)
+                self.drop_white_space += 1
+            if fixed:
+                self.o('`')
+                self.drop_white_space += 1
+                self.code = True
+        else:
+            if bold or italic or fixed:
+                # there must not be whitespace before closing emphasis mark
+                self.emphasis -= 1
+                self.space = 0
+                self.outtext = self.outtext.rstrip()
+            if fixed:
+                if self.drop_white_space:
+                    # empty emphasis, drop it
+                    self.drop_last(1)
+                    self.drop_white_space -= 1
+                else:
+                    self.o('`')
+                self.code = False
+            if bold:
+                if self.drop_white_space:
+                    # empty emphasis, drop it
+                    self.drop_last(2)
+                    self.drop_white_space -= 1
+                else:
+                    self.o(self.strong_mark)
+            if italic:
+                if self.drop_white_space:
+                    # empty emphasis, drop it
+                    self.drop_last(1)
+                    self.drop_white_space -= 1
+                else:
+                    self.o(self.emphasis_mark)
+            # space is only allowed after *all* emphasis marks
+            if (bold or italic) and not self.emphasis:
+                self.o(" ")
+            if strikethrough:
+                self.quiet -= 1
+
     def handle_tag(self, tag, attrs, start):
         #attrs = fixattrs(attrs)
-
+        if attrs is None:
+            attrs = {}
+        else:
+            attrs = dict(attrs)
+
+        if self.google_doc:
+            # the attrs parameter is empty for a closing tag. in addition, we
+            # need the attributes of the parent nodes in order to get a
+            # complete style description for the current element. we assume
+            # that google docs export well formed html.
+            parent_style = {}
+            if start:
+                if self.tag_stack:
+                    parent_style = self.tag_stack[-1][2]
+                tag_style = element_style(attrs, self.style_def, parent_style)
+                self.tag_stack.append((tag, attrs, tag_style))
+            else:
+                dummy, attrs, tag_style = self.tag_stack.pop()
+                if self.tag_stack:
+                    parent_style = self.tag_stack[-1][2]
+
         if hn(tag):
             self.p()
-            if start: self.o(hn(tag)*"#" + ' ')
+            if start:
+                self.inheader = True
+                self.o(hn(tag)*"#" + ' ')
+            else:
+                self.inheader = False
+                return # prevent redundant emphasis marks on headers
+
+        if tag in ['p', 'div']:
+            if self.google_doc:
+                if start and google_has_height(tag_style):
+                    self.p()
+                else:
+                    self.soft_br()
+            else:
+                self.p()
 
-        if tag in ['p', 'div']: self.p()
-
         if tag == "br" and start: self.o("  \n")
 
         if tag == "hr" and start:
@@ -244,30 +422,41 @@
             self.o("* * *")
             self.p()
 
-        if tag in ["head", "style", 'script']: 
+        if tag in ["head", "style", 'script']:
             if start: self.quiet += 1
             else: self.quiet -= 1
 
+        if tag == "style":
+            if start: self.style += 1
+            else: self.style -= 1
+
         if tag in ["body"]:
             self.quiet = 0 # sites like 9rules.com never close <head>
-
+
         if tag == "blockquote":
-            if start: 
+            if start:
                 self.p(); self.o('> ', 0, 1); self.start = 1
                 self.blockquote += 1
             else:
                 self.blockquote -= 1
                 self.p()
-
-        if tag in ['em', 'i', 'u']: self.o("_")
-        if tag in ['strong', 'b']: self.o("**")
-        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
+
+        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o(self.emphasis_mark)
+        if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o(self.strong_mark)
+        if tag in ['del', 'strike', 's']:
+            if start:
+                self.o("<"+tag+">")
+            else:
+                self.o("</"+tag+">")
+
+        if self.google_doc:
+            if not self.inheader:
+                # handle some font attributes, but leave headers clean
+                self.handle_emphasis(start, tag_style, parent_style)
+
+        if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
         if tag == "abbr":
             if start:
-                attrsD = {}
-                for (x, y) in attrs: attrsD[x] = y
-                attrs = attrsD
-
                 self.abbr_title = None
                 self.abbr_data = ''
                 if has_key(attrs, 'title'):
@@ -277,80 +466,93 @@
                     self.abbr_list[self.abbr_data] = self.abbr_title
                     self.abbr_title = None
                 self.abbr_data = ''
-
-        if tag == "a":
+
+        if tag == "a" and not self.ignore_links:
             if start:
-                attrsD = {}
-                for (x, y) in attrs: attrsD[x] = y
-                attrs = attrsD
-
-                if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
+                if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
                     self.astack.append(attrs)
-                    self.o("[")
+                    self.maybe_automatic_link = attrs['href']
                 else:
                     self.astack.append(None)
             else:
                 if self.astack:
                     a = self.astack.pop()
-                    if a:
+                    if self.maybe_automatic_link:
+                        self.maybe_automatic_link = None
+                    elif a:
+                        if self.inline_links:
+                            self.o("](" + escape_md(a['href']) + ")")
+                        else:
-                        i = self.previousIndex(a)
-                        if i is not None:
-                            a = self.a[i]
+                            i = self.previousIndex(a)
+                            if i is not None:
+                                a = self.a[i]
+                            else:
-                        self.acount += 1
-                        a['count'] = self.acount
-                        a['outcount'] = self.outcount
-                        self.a.append(a)
-                        self.o("][" + str(a['count']) + "]")
+                                self.acount += 1
+                                a['count'] = self.acount
+                                a['outcount'] = self.outcount
+                                self.a.append(a)
+                            self.o("][" + str(a['count']) + "]")
 
-        if tag == "img" and start:
-            attrsD = {}
-            for (x, y) in attrs: attrsD[x] = y
-            attrs = attrsD
+        if tag == "img" and start and not self.ignore_images:
             if has_key(attrs, 'src'):
                 attrs['href'] = attrs['src']
                 alt = attrs.get('alt', '')
-                i = self.previousIndex(attrs)
-                if i is not None:
-                    attrs = self.a[i]
+                self.o("![" + escape_md(alt) + "]")
+
+                if self.inline_links:
+                    self.o("(" + escape_md(attrs['href']) + ")")
                 else:
-                    self.acount += 1
-                    attrs['count'] = self.acount
-                    attrs['outcount'] = self.outcount
-                    self.a.append(attrs)
-                self.o("![")
-                self.o(alt)
-                self.o("]["+ str(attrs['count']) +"]")
-
+                    i = self.previousIndex(attrs)
+                    if i is not None:
+                        attrs = self.a[i]
+                    else:
+                        self.acount += 1
+                        attrs['count'] = self.acount
+                        attrs['outcount'] = self.outcount
+                        self.a.append(attrs)
+                    self.o("[" + str(attrs['count']) + "]")
+
         if tag == 'dl' and start: self.p()
         if tag == 'dt' and not start: self.pbr()
         if tag == 'dd' and start: self.o('    ')
         if tag == 'dd' and not start: self.pbr()
-
+
         if tag in ["ol", "ul"]:
+            # Google Docs create sub lists as top level lists
+            if (not self.list) and (not self.lastWasList):
+                self.p()
             if start:
-                self.list.append({'name':tag, 'num':0})
+                if self.google_doc:
+                    list_style = google_list_style(tag_style)
+                else:
+                    list_style = tag
+                numbering_start = list_numbering_start(attrs)
+                self.list.append({'name':list_style, 'num':numbering_start})
            else:
                 if self.list: self.list.pop()
-
-            self.p()
-
+            self.lastWasList = True
+        else:
+            self.lastWasList = False
+
         if tag == 'li':
+            self.pbr()
             if start:
-                self.pbr()
                 if self.list: li = self.list[-1]
                 else: li = {'name':'ul', 'num':0}
-                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
-                if li['name'] == "ul": self.o("* ")
+                if self.google_doc:
+                    nest_count = self.google_nest_count(tag_style)
+                else:
+                    nest_count = len(self.list)
+                self.o("  " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
+                if li['name'] == "ul": self.o(self.ul_item_mark + " ")
                 elif li['name'] == "ol":
                     li['num'] += 1
                     self.o(str(li['num'])+". ")
                 self.start = 1
-            else:
-                self.pbr()
-
+
         if tag in ["table", "tr"] and start: self.p()
         if tag == 'td': self.pbr()
-
+
         if tag == "pre":
             if start:
                 self.startpre = 1
@@ -358,34 +560,59 @@
                 self.pre = 0
                 self.p()
-
+
     def pbr(self):
-        if self.p_p == 0: self.p_p = 1
+        if self.p_p == 0:
+            self.p_p = 1
+
+    def p(self):
+        self.p_p = 2
+
+    def soft_br(self):
+        self.pbr()
+        self.br_toggle = '  '
 
-    def p(self): self.p_p = 2
-
     def o(self, data, puredata=0, force=0):
-        if self.abbr_data is not None: self.abbr_data += data
-
-        if not self.quiet: 
+        if self.abbr_data is not None:
+            self.abbr_data += data
+
+        if not self.quiet:
+            if self.google_doc:
+                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
+                lstripped_data = data.lstrip()
+                if self.drop_white_space and not (self.pre or self.code):
+                    data = lstripped_data
+                if lstripped_data != '':
+                    self.drop_white_space = 0
+
             if puredata and not self.pre:
                 data = re.sub('\s+', ' ', data)
                 if data and data[0] == ' ':
                     self.space = 1
                     data = data[1:]
                 if not data and not force: return
 
             if self.startpre:
                 #self.out(" :") #TODO: not output when already one there
-                self.startpre = 0
-
+                if not data.startswith("\n"): # <pre>stuff...
      +                    data = "\n" + data
      +
                   bq = (">" * self.blockquote)
                   if not (force and data and data[0] == ">") and self.blockquote: bq += " "
      -            
      +
                   if self.pre:
      -                bq += "    "
      +                if not self.list:
      +                    bq += "    "
      +                #else: list content is already partially indented
      +                for i in xrange(len(self.list)):
      +                    bq += "    "
                       data = data.replace("\n", "\n"+bq)
      -            
      +
      +            if self.startpre:
      +                self.startpre = 0
      +                if self.list:
      +                    data = data.lstrip("\n") # use existing initial indentation
      +
                   if self.start:
                       self.space = 0
                       self.p_p = 0
      @@ -397,22 +624,22 @@ class _html2text(HTMLParser.HTMLParser):
                       self.out("\n")
                       self.space = 0
       
      -
                   if self.p_p:
      -                self.out(('\n'+bq)*self.p_p)
      +                self.out((self.br_toggle+'\n'+bq)*self.p_p)
                       self.space = 0
      -                
      +                self.br_toggle = ''
      +
                   if self.space:
                       if not self.lastWasNL: self.out(' ')
                       self.space = 0
       
      -            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
      +            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
                       if force == "end": self.out("\n")
       
                       newa = []
                       for link in self.a:
                           if self.outcount > link['outcount']:
      -                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href'])) 
      +                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
                               if has_key(link, 'title'): self.out(" ("+link['title']+")")
                               self.out("\n")
                           else:
      @@ -421,22 +648,158 @@ class _html2text(HTMLParser.HTMLParser):
                       if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
       
                       self.a = newa
      -            
      +
                   if self.abbr_list and force == "end":
                       for abbr, definition in self.abbr_list.items():
                           self.out("  *[" + abbr + "]: " + definition + "\n")
       
                   self.p_p = 0
                   self.out(data)
      -            self.lastWasNL = data and data[-1] == '\n'
                   self.outcount += 1
       
           def handle_data(self, data):
               if r'\/script>' in data: self.quiet -= 1
      +
      +        if self.style:
      +            self.style_def.update(dumb_css_parser(data))
      +
      +        if not self.maybe_automatic_link is None:
      +            href = self.maybe_automatic_link
      +            if href == data and self.absolute_url_matcher.match(href):
      +                self.o("<" + data + ">")
      +                return
      +            else:
      +                self.o("[")
      +                self.maybe_automatic_link = None
      +
      +        if not self.code and not self.pre:
      +            data = escape_md_section(data, snob=self.escape_snob)
               self.o(data, 1)
      -    
      +
           def unknown_decl(self, data): pass
       
      +    def charref(self, name):
      +        if name[0] in ['x','X']:
      +            c = int(name[1:], 16)
      +        else:
      +            c = int(name)
      +
      +        if not self.unicode_snob and c in unifiable_n.keys():
      +            return unifiable_n[c]
      +        else:
      +            try:
      +                return unichr(c)
      +            except NameError: #Python3
      +                return chr(c)
      +
      +    def entityref(self, c):
      +        if not self.unicode_snob and c in unifiable.keys():
      +            return unifiable[c]
      +        else:
      +            try: name2cp(c)
      +            except KeyError: return "&" + c + ';'
      +            else:
      +                try:
      +                    return unichr(name2cp(c))
      +                except NameError: #Python3
      +                    return chr(name2cp(c))
      +
      +    def replaceEntities(self, s):
      +        s = s.group(1)
      +        if s[0] == "#":
      +            return self.charref(s[1:])
      +        else: return self.entityref(s)
      +
      +    r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
      +    def unescape(self, s):
      +        return self.r_unescape.sub(self.replaceEntities, s)
      +
      +    def google_nest_count(self, style):
      +        """calculate the nesting count of google doc lists"""
      +        nest_count = 0
      +        if 'margin-left' in style:
      +            nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
      +        return nest_count
      +
      +
      +    def optwrap(self, text):
      +        """Wrap all paragraphs in the provided text."""
      +        if not self.body_width:
      +            return text
      +
      +        assert wrap, "Requires Python 2.3."
      +        result = ''
      +        newlines = 0
      +        for para in text.split("\n"):
      +            if len(para) > 0:
      +                if not skipwrap(para):
      +                    result += "\n".join(wrap(para, self.body_width))
      +                    if para.endswith('  '):
      +                        result += "  \n"
      +                        newlines = 1
      +                    else:
      +                        result += "\n\n"
      +                        newlines = 2
      +                else:
      +                    if not onlywhite(para):
      +                        result += para + "\n"
      +                        newlines = 1
      +            else:
      +                if newlines < 2:
      +                    result += "\n"
      +                    newlines += 1
      +        return result
      +
      +ordered_list_matcher = re.compile(r'\d+\.\s')
      +unordered_list_matcher = re.compile(r'[-\*\+]\s')
      +md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
      +md_chars_matcher_all = re.compile(r"([`\*_{}\[\]\(\)#!])")
      +md_dot_matcher = re.compile(r"""
      +    ^             # start of line
      +    (\s*\d+)      # optional whitespace and a number
      +    (\.)          # dot
      +    (?=\s)        # lookahead assert whitespace
      +    """, re.MULTILINE | re.VERBOSE)
      +md_plus_matcher = re.compile(r"""
      +    ^
      +    (\s*)
      +    (\+)
      +    (?=\s)
      +    """, flags=re.MULTILINE | re.VERBOSE)
      +md_dash_matcher = re.compile(r"""
      +    ^
      +    (\s*)
      +    (-)
      +    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
      +                  # or another dash (header or hr)
      +    """, flags=re.MULTILINE | re.VERBOSE)
      +slash_chars = r'\`*_{}[]()#+-.!'
      +md_backslash_matcher = re.compile(r'''
      +    (\\)          # match one slash
      +    (?=[%s])      # followed by a char that requires escaping
      +    ''' % re.escape(slash_chars),
      +    flags=re.VERBOSE)
      +
      +def skipwrap(para):
      +    # If the text begins with four spaces or one tab, it's a code block; don't wrap
      +    if para[0:4] == '    ' or para[0] == '\t':
      +        return True
      +    # If the text begins with only two "--", possibly preceded by whitespace, that's
      +    # an emdash; so wrap.
      +    stripped = para.lstrip()
      +    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
      +        return False
      +    # I'm not sure what this is for; I thought it was to detect lists, but there's
+    # a <br>-inside-<span> case in one of the tests that also depends upon it.
+    if stripped[0:1] == '-' or stripped[0:1] == '*':
+        return True
+    # If the text begins with a single -, *, or +, followed by a space, or an integer,
+    # followed by a ., followed by a space (in either case optionally preceeded by
+    # whitespace), it's a list; don't wrap.
+    if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
+        return True
+    return False
+
 def wrapwrite(text):
     text = text.encode('utf-8')
     try: #Python3
@@ -444,25 +807,61 @@ def wrapwrite(text):
     except AttributeError:
         sys.stdout.write(text)
 
-def html2text_file(html, out=wrapwrite, baseurl=''):
-    h = _html2text(out, baseurl)
-    h.feed(html)
-    h.feed("")
-    return h.close()
-
 def html2text(html, baseurl=''):
-    txt = html2text_file(html, None, baseurl)
-    return optwrap(txt) #.encode('utf-8'))
-
-if __name__ == "__main__":
+    h = HTML2Text(baseurl=baseurl)
+    return h.handle(html)
+
+def unescape(s, unicode_snob=False):
+    h = HTML2Text()
+    h.unicode_snob = unicode_snob
+    return h.unescape(s)
+
+def escape_md(text):
+    """Escapes markdown-sensitive characters within other markdown constructs."""
+    return md_chars_matcher.sub(r"\\\1", text)
+
+def escape_md_section(text, snob=False):
+    """Escapes markdown-sensitive characters across whole document sections."""
+    text = md_backslash_matcher.sub(r"\\\1", text)
+    if snob:
+        text = md_chars_matcher_all.sub(r"\\\1", text)
+    text = md_dot_matcher.sub(r"\1\\\2", text)
+    text = md_plus_matcher.sub(r"\1\\\2", text)
+    text = md_dash_matcher.sub(r"\1\\\2", text)
+    return text
+
+
+def main():
     baseurl = ''
 
     p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                               version='%prog ' + __version__)
-    args = p.parse_args()[1]
+    p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
+        default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
+    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
+        default=IGNORE_ANCHORS, help="don't include any formatting for links")
+    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
+        default=IGNORE_IMAGES, help="don't include any formatting for images")
+    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
+        default=False, help="convert an html-exported Google Document")
+    p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
+        default=False, help="use a dash rather than a star for unordered list items")
+    p.add_option("-e", "--asterisk-emphasis", action="store_true", dest="em_style_asterisk",
+        default=False, help="use an asterisk rather than an underscore for emphasized text")
+    p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
+        default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
+    p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
+        default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
+    p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
+        default=False, help="hide strike-through text. only relevant when -g is specified as well")
+    p.add_option("--escape-all", action="store_true", dest="escape_snob",
+        default=False, help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
+    (options, args) = p.parse_args()
+
+    # process input
+    encoding = "utf-8"
     if len(args) > 0:
         file_ = args[0]
-        encoding = None
         if len(args) == 2:
             encoding = args[1]
         if len(args) > 2:
@@ -471,17 +870,15 @@
         if file_.startswith('http://') or file_.startswith('https://'):
             baseurl = file_
             j = urllib.urlopen(baseurl)
-            text = j.read()
+            data = j.read()
             if encoding is None:
                 try:
                     from feedparser import _getCharacterEncoding as enc
                 except ImportError:
                     enc = lambda x, y: ('utf-8', 1)
-                encoding = enc(j.headers, text)[0]
+                encoding = enc(j.headers, data)[0]
                 if encoding == 'us-ascii':
                     encoding = 'utf-8'
-            data = text.decode(encoding)
-
         else:
             data = open(file_, 'rb').read()
             if encoding is None:
@@ -490,7 +887,28 @@
                 except ImportError:
                     detect = lambda x: {'encoding': 'utf-8'}
                 encoding = detect(data)['encoding']
-            data = data.decode(encoding)
     else:
         data = sys.stdin.read()
-    wrapwrite(html2text(data, baseurl))
+
+    data = data.decode(encoding)
+    h = HTML2Text(baseurl=baseurl)
+    # handle options
+    if options.ul_style_dash: h.ul_item_mark = '-'
+    if options.em_style_asterisk:
+        h.emphasis_mark = '*'
+        h.strong_mark = '__'
+
+    h.body_width = options.body_width
+    h.list_indent = options.list_indent
+    h.ignore_emphasis = options.ignore_emphasis
+    h.ignore_links = options.ignore_links
+    h.ignore_images = options.ignore_images
+    h.google_doc = options.google_doc
+    h.hide_strikethrough = options.hide_strikethrough
+    h.escape_snob = options.escape_snob
+
+    wrapwrite(h.handle(data))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/frappe/website/doctype/web_page/web_page.py b/frappe/website/doctype/web_page/web_page.py
index 2dd11af948..f4ae972dd2 100644
--- a/frappe/website/doctype/web_page/web_page.py
+++ b/frappe/website/doctype/web_page/web_page.py
@@ -40,6 +40,7 @@ def sync_statics():
 	while True:
 		_sync_statics()
 		frappe.conn.commit()
+		print "sync complete"
 		time.sleep(2)
 
 def _sync_statics():
@@ -48,9 +49,13 @@ def _sync_statics():
 
 	def sync_file(fname, fpath, statics_path, priority=0):
 		url = os.path.relpath(fpath, statics_path).rsplit(".", 1)[0]
+
 		if fname.rsplit(".", 1)[0]=="index" and os.path.dirname(fpath) != statics_path:
 			url = os.path.dirname(url)
-
+
+		if url in synced:
+			return
+
 		parent_website_route = os.path.dirname(url)
 		page_name = os.path.basename(url)
 
@@ -61,7 +66,7 @@ def _sync_statics():
 			title, content = get_static_content(fpath)
 			if not title:
 				title = page_name.replace("-", " ").replace("_", " ").title()
-			to_insert.append([frappe.bean({
+			page = frappe.bean({
 				"doctype":"Web Page",
 				"idx": priority,
 				"title": title,
@@ -69,8 +74,17 @@ def _sync_statics():
 				"main_section": content,
 				"published": 1,
 				"parent_website_route": parent_website_route
-			}), os.path.getmtime(fpath)])
-
+			})
+
+			page.insert()
+
+			# update timestamp
+			sitemap = frappe.bean("Website Route", {"ref_doctype": "Web Page",
+				"docname": page.doc.name})
+			sitemap.doc.static_file_timestamp = os.path.getmtime(fpath)
+			sitemap.save()
+
+			synced.append(url)
 		else:
 			if str(os.path.getmtime(fpath))!=sitemap.doc.static_file_timestamp \
 				or cint(sitemap.doc.idx) != cint(priority):
@@ -113,12 +127,29 @@ def _sync_statics():
 			if not has_index:
 				continue
 
-			# other files
-			for fname in files:
-				page_name = fname.rsplit(".", 1)[0]
-				if not (page_name=="index" and basepath!=statics_path):
-					sync_file(fname, os.path.join(basepath, fname), statics_path,
-						index.index(page_name) if page_name in index else 0)
+			if index:
+				# index.txt given
+				for i, page_name in enumerate(index):
+					if page_name in folders:
+						# for folder, sync inner index first (so that idx is set)
+						for extn in ("md", "html"):
+							path = os.path.join(basepath, page_name, "index." + extn)
+							if os.path.exists(path):
+								sync_file("index." + extn, path, statics_path, i)
+								break
+
+					# other files
+					for extn in ("md", "html"):
+						path = os.path.join(basepath, page_name + "." + extn)
+						if page_name + "." + extn in files:
+							sync_file(page_name + "." + extn, path, statics_path, i)
+							break
+
+			else:
+				for fname in files:
+					page_name = fname.rsplit(".", 1)[0]
+					if not (page_name=="index" and basepath!=statics_path):
+						sync_file(fname, os.path.join(basepath, fname), statics_path, None)
 
 	# delete not synced
 	if synced:
@@ -128,18 +159,7 @@ def _sync_statics():
 	else:
 		frappe.delete_doc("Web Page", frappe.conn.sql_list("""select docname
 			from `tabWebsite Route` where ifnull(static_file_timestamp,'')!=''
 			order by (rgt-lft) asc"""))
-
-
-	# insert
-	for page, mtime in to_insert:
-		page.insert()
-
-		# update timestamp
-		sitemap = frappe.bean("Website Route", {"ref_doctype": "Web Page",
-			"docname": page.doc.name})
-		sitemap.doc.static_file_timestamp = mtime
-		sitemap.save()
-
+
 def get_static_content(fpath):
 	with open(fpath, "r") as contentfile: