Version 3 (modified by xkjq, 2 years ago)

--

!PasteHTML

Adds the ability to paste formatted text from programs that support copying as HTML.

Notes

  • The script is set up for the default WikidPad parser (though it is customizable if you use a custom parser or don't like the default settings).
  • It's still early days, so some things may be missing or handled incorrectly. If so, let me know and I'll look at fixing it.
  • I've tested this mainly with content from Wikipedia. While it should work with any (valid) HTML, your results may vary.

Requirements

  • gtk

Installation

As usual, save the code below to a file in your user_extensions folder.

Code

########################################################################
#
# PasteHtml.py
# ------------
# HTML parser for Wikidpad
#
# v0.1 by xkjq@ymail.com
#
# A plugin that retrieves HTML text from the clipboard and converts it
# to a WikidPad-compatible format.
#
# Unfortunately wxPython's clipboard support appears to be a bit lacking
# (or I'm just unable to work it out), so GTK is required. If someone
# knows of a cross-platform way to get HTML text in wxPython (or can
# provide the DataObjects required), please let me know or feel free
# to modify the script yourself.
#
# The script is customisable, so it shouldn't be too hard to modify it
# for custom parsers. Beware: it's still early days and things are likely
# to change in future versions.
#
# Note: Formatting can only be retrieved if it is defined by HTML tags;
#       CSS formatting will be ignored.
#
# TODO: Clean up the code and maybe even give the script a nice GUI
#
########################################################################
#
# The following settings define how the text is formated. If the
# defaults are not to your liking they can be changed here.
# 
# If you use a custom parser chances are you *will* need to change this.
#
#
# START CONFIGURATION
#
# Some HTML tags have no equivalent in the default wikiParser language.
# If (like me) you haven't got round to adding them to your parser, but
# still want to keep the formatting, add them below.

# Tags in this set are left in the pasted text as literal
# "<tag>" / "</tag>" markup instead of being stripped.
tags_to_keep = set(["u", "sup", "sub"])

# tags_to_sub maps an HTML tag to the wiki markup that replaces it:
#
#       html_tag : (text for the opening tag, text for the closing tag)
#
# For example the entry
#       u"b" : (u"*", u"*")
# turns         <b>Hello World!</b>
# into          *Hello World!*
tags_to_sub = {
    # Bold / italic
    u"b": (u"*", u"*"),
    u"strong": (u"*", u"*"),
    u"i": (u"_", u"_"),
    u"em": (u"_", u"_"),

    # Paragraphs
    u"p": (u"\n", u"\n"),

    # Headings: each level adds one more "+"
    u"h1": (u"\n++", u"\n"),
    u"h2": (u"\n+++", u"\n"),
    u"h3": (u"\n++++", u"\n"),
    u"h4": (u"\n+++++", u"\n"),
    u"h5": (u"\n++++++", u"\n"),
    u"h6": (u"\n+++++++", u"\n"),
    u"h7": (u"\n++++++++", u"\n"),
    u"h8": (u"\n+++++++++", u"\n"),

    # Line breaks
    u"br": (u"\n", u""),

    # Links
    u"a": (u"[", u"]"),
}

# 
# Table formatting. Tags listed in format_table are looked up here
# before the general tables above whenever table handling applies.
# Each entry is (text for the opening tag, text for the closing tag).
format_table = {
    u"table": (u"\n<<|\n", u">>\n"),
    u"tr": (u"", u"\n"),
    u"td": (u"|", u""),

    # th marks header cells. Currently wikidpad (the default parser at
    # least) does not distinguish the table header.
    u"th": (u"|", u""),

    # Caption text is inserted just above the table
    u"caption": (u"", u""),

    # Tags defined earlier can be overridden while inside a table,
    # e.g. br should become "\newline" rather than a plain "newline"
    # for the default parser
    u"br": (u"\\\n", u""),
}

# When True, table formatting is ignored as long as no table start tag
# has been encountered.
ignore_table_formatting_when_not_in_table = False

# When True and no table is open, any tag listed in table_start_tags
# opens one automatically. Takes precedence over
# ignore_table_formatting_when_not_in_table.
open_tables_if_needed = True

# The only tags that are allowed to open a table automatically
table_start_tags = set([u"tr", u"td", u"th", u"caption"])

# Close any table still open at the end of the text; useful when only
# part of a table was pasted
close_open_tables = True

# Key into format_table whose formatting is used when a table is opened.
# Probably best left at the default.
table_start = u"table"

# When True, a tag with no text between its opening and closing tag is
# dropped unless it is listed in allowed_empty_tags. Can solve some
# issues but may create others.
# e.g.  <h2></h2> is ignored
#       <h2>TEST</h2> is not
ignore_empty_tags = True
allowed_empty_tags = set(["img", "p", "br"])  # Should any others be in here?

# List tags named here are converted to wikidpad lists. The bullet used
# for unordered lists (default: *) may be changed; the "ordered" marker
# is interpreted by the script and cannot be customised.
lists_format = {
    "ul": "*",
    "ol": "ordered",  # Not customisable
}

# Indentation unit placed in front of list items, repeated once per
# nesting level. Use
#     spacer = "\t"
# for tab indented lists.
spacer = "    "

# False: every ordered list becomes a numbered list.
# True:  the list type (alphabetical, roman numerals) is kept. I don't
#        think the default parser supports this at the moment.
allow_extra_ol_formats = True

# True:  an image is replaced by its src attribute (i.e. its url), so
#        it should show up in preview mode (provided the link is
#        absolute rather than relative?).
# False: images are ignored.
# TODO: Add option to save image locally
add_image_src = True

# When set, the image size is appended to the image url automatically
maintain_image_resizes = True

# Is this customisable?? If so what about s/r/a etc?
wikipad_url_appendix_delimiter = ">"

# URL handling
# When True all anchors are converted to wikidpad links
maintain_links = False
wikipad_link_delimiter = "|"

# pre_blocks_maintain_formating: when True, the content of pre tags
# keeps its formatting and is wrapped in the two strings given in
# pre_block_tags.
# TODO: make a few changes so this is actually useful
pre_blocks_maintain_formating = False
pre_block_tags = ("<<pre\n", "\n>>")

# Custom replaces
# ---------------
# Extra clean-up rules applied to the converted text. Handy for
# automatically removing unwanted formatting (from sites such as
# wikipedia etc.).
#
# custom_replace entries are (a, b) pairs passed to str.replace: every
# occurrence of a is replaced by b.
# custom_replace_regex entries are (pattern, replacement) pairs passed
# to re.sub.
enable_custom_replace = True

custom_replace = [
    ("[edit]", ""),  # Remove some unwanted wikipedia formatting
    ("<sup>[_citation needed_]</sup>", ""),
]

custom_replace_regex = [
    (r"<sup>(\[\d*\])?</sup>", ""),  # Remove wikipedia refs, e.g. [2]
]
# 
#
# END SCRIPT CONFIGURATION
# You shouldn't (!) have to edit anything below here
########################################################################

import sys, wx, re

from HTMLParser import HTMLParser

# True once GTK has been imported successfully; the plugin only registers
# itself with wikidpad in that case.
gkt_imported = False

try:
    import gtk
    import gobject
    gkt_imported = True
except Exception:
    # Any failure while importing GTK disables the plugin, but (unlike a
    # bare "except:") KeyboardInterrupt and SystemExit are not swallowed.
    # The parenthesised single-argument form prints the same text whether
    # print is a statement or a function.
    print("PasteHTML Error - Unable to import GTK")

if gkt_imported:
    WIKIDPAD_PLUGIN = (("MenuFunctions",1),)

    def describeMenuItems(wiki):
        """
        Returns the menu entries provided by this plugin.

        wiki is not used here.

        The result is a tuple holding one entry: the Paste callback, the
        menu label (with its keyboard shortcut after the tab) and a
        help text.
        """
        return ((Paste, "Paste HTML\tCtrl-Shift-V", "Pastes HTML in a wikipad compatable format"),)


# Custom lists conversion
#
# (value, numeral) pairs in descending order, including the subtractive
# forms (CM, CD, XC, ...). Stored as a list so it can be walked on every
# call: a bare zip() is a one-shot iterator on Python 3 and would be
# exhausted after the first conversion.
numeral_map = list(zip(
    (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1),
    ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
))


def intToRoman(i):
    """
    Converts integer to roman numeral

    Walks numeral_map from the largest value down, emitting each numeral
    as many times as its value fits into what is left of i.

    Returns the numeral as an upper case string; 0 gives an empty string.
    """
    result = []
    for integer, numeral in numeral_map:
        # Floor division keeps the count exact regardless of whether "/"
        # means true or floor division in the running interpreter.
        count = int(i // integer)
        result.append(numeral * count)
        i -= integer * count
    return ''.join(result)

# Index 0 is a placeholder so that letters[1] == "A" ... letters[26] == "Z"
letters = "0ABCDEFGHIJKLMNOPQRSTUVWXYZ"


def intToLetter(i):
    """
    Converts integer to letter format

    Uses the usual list/spreadsheet numbering: 1 -> A, 26 -> Z, 27 -> AA,
    52 -> AZ, 53 -> BA, 702 -> ZZ, 703 -> AAA, ...

    Numbers below 1 have no letter form and give the placeholder "0".
    """
    if i < 1:
        return letters[0]

    digits = []
    while i > 0:
        # Bijective base 26: there is no zero digit, so shift by one
        # before splitting off the last letter. This is what makes
        # multiples of 26 end in "Z" instead of the "0" placeholder.
        i, remainder = divmod(i - 1, 26)
        digits.append(letters[remainder + 1])

    # Letters were produced last-to-first
    return "".join(reversed(digits))


class HTMLStripper(HTMLParser):
    """
    Converts HTML into wikidpad markup.

    Tags listed in tags_to_keep are passed through, tags listed in
    tags_to_sub / format_table / lists_format are replaced by their
    wiki equivalent and every other tag is stripped.

    Feed the html with feed() and collect the result with get_data().
    """
    def __init__(self):
        # Run the full base class initialiser rather than just reset(), so
        # any state the base class sets up there is present. Called
        # directly (not through super()) so it also works where
        # HTMLParser is an old-style class.
        HTMLParser.__init__(self)

        # Output fragments, joined by get_data()
        self.fed = []

        self.spacer = spacer

        self.tags_to_keep = tags_to_keep
        self.tags_to_sub = tags_to_sub
        self.lists_format = lists_format

        # One (list marker, html "type" attribute) entry per open list
        self.list_structure = []

        # Item counter for each open list; kept in step with
        # list_structure
        self.list_numbers = []

        # Is it realistic to handle tables within tables?
        self.in_table = 0

        # True until the first cell of the current table row was seen
        self.new_table_row = True

        # Name of the tag that was opened last, or None once any text or
        # end tag followed it. Used to detect empty tags.
        self.last_tag = None

        # State captured just before last_tag was processed, see
        # handle_starttag() / _discard_last_tag()
        self._last_tag_state = None

    def handle_starttag(self, tag, attrs):
        """
        Called whenever an opening tag is reached

        Generally we add the wikipad equivalent to the
        list self.fed

        tag is the lower case tag name, attrs the list of
        (name, value) attribute tuples supplied by the parser.
        """

        self.last_tag = tag

        # Remember what things looked like before this tag did anything,
        # so that the tag can be undone completely should it turn out to
        # be empty (see handle_endtag).
        self._last_tag_state = (
            len(self.fed),
            self.in_table,
            list(self.list_structure),
            list(self.list_numbers),
        )

        if tag == u"img":
            self.handle_image(attrs)
            return

        # If we're in a table check table formatting first
        if tag in format_table:

            if tag == table_start:
                self.in_table += 1
            if tag == u"tr":
                self.new_table_row = True

            if self.in_table < 1: # Table not started
                if open_tables_if_needed and tag in \
                        table_start_tags: # Start table

                    self.fed.append(format_table[table_start][0])
                    self.in_table = 1

                    self.handle_table_tag(tag)
                    return
                elif ignore_table_formatting_when_not_in_table:
                    return
            else:
                # Table formatting replaces the general formatting while
                # inside a table; falling through would emit both (e.g.
                # "\\\n" followed by "\n" for a br).
                self.handle_table_tag(tag)
                return

        if tag in self.tags_to_keep:
            self.fed.append(u"<{0}>".format(tag))
            return

        if tag in self.tags_to_sub:

            # Parse anchors start tag
            if tag == u"a":
                if maintain_links:
                    link = self.get_attribute(attrs, u"href", u"Unable to find link")

                    self.fed.append("".join([self.tags_to_sub[tag][0], link, wikipad_link_delimiter]))
                return

            self.fed.append(self.tags_to_sub[tag][0])

        # Handle lists
        if tag in self.lists_format:

            list_type = self.get_attribute(attrs, u"type")

            self.list_structure.append((self.lists_format[tag], list_type))
            self.list_numbers.append(0)

        if tag == u"li" and len(self.list_structure) > 0:
            self.list_numbers[-1] += 1
            list_item_number = self.list_numbers[-1]

            list_start, list_type = self.list_structure[-1]

            if list_start == u"ordered": # Ordered list

                n = list_item_number

                if allow_extra_ol_formats:

                    if list_type == u"A": # Capital letters
                        n = intToLetter(list_item_number)

                    elif list_type == u"a": # Lower letters
                        n = intToLetter(list_item_number).lower()

                    elif list_type == u"I": # Roman letters
                        n = intToRoman(list_item_number)

                    elif list_type == u"i": # Roman lower letters
                        n = intToRoman(list_item_number).lower()

                list_start = u"{0}.".format(n)

            self.fed.append(u"{0}{1} ".format(self.spacer*len(self.list_structure), list_start))

    def handle_startendtag(self, tag, attrs):
        """
        Self closing tags are handled here
        e.g.    <img src="...." />
                <br />

        For now just redirects to handle_starttag()

        Are there any situations in which this shouldn't
        happen?
        """
        self.handle_starttag(tag, attrs)

    def handle_endtag(self, tag):
        """
        Called as the tag is closed

        Adds the closing wiki markup for tag, or removes the tag
        altogether when it is empty and ignore_empty_tags is set.
        """

        is_empty = tag == self.last_tag

        # A caption is exempt from empty tag removal: its start tag took
        # the table start out of the output and only the caption branch
        # below puts it back.
        if ignore_empty_tags and is_empty \
                and tag not in allowed_empty_tags and tag != u"caption":
            self._discard_last_tag()
            return

        # Any end tag ends the "just opened" window of the last start tag
        self.last_tag = None

        if tag in format_table:
            if tag == u"caption":
                # Restart the table that handle_table_tag() removed
                self.fed.append(format_table[table_start][0])
                return

            was_in_table = self.in_table > 0

            # A stray closing tag must not push the counter below zero,
            # otherwise the next real table would not be recognised.
            if tag == table_start and was_in_table:
                self.in_table -= 1
            self.fed.append(format_table[tag][1])

            # Mirror handle_starttag(): inside a table the table
            # formatting replaces the general formatting.
            if was_in_table:
                return

        if tag == u"a":
            if maintain_links:
                self.fed.append(self.tags_to_sub[tag][1])
            return

        if tag in self.tags_to_keep:
            self.fed.append(u"</{0}>".format(tag))
            return

        if tag in self.tags_to_sub:

            self.fed.append(self.tags_to_sub[tag][1])

        if tag == u"li" and len(self.list_structure) > 0:
            self.fed.append(u"\n")

        if tag in self.lists_format and len(self.list_structure) > 0:
            # Drop the list and its item counter together, so that the
            # enclosing list carries on with its own numbering.
            del self.list_structure[-1]
            del self.list_numbers[-1]

    def _discard_last_tag(self):
        """
        Undoes everything the most recent start tag did.

        Only the output that tag produced is removed (possibly nothing),
        and the table depth and list bookkeeping are put back to what
        they were before the tag was opened.
        """
        state = self._last_tag_state
        if state is not None:
            fed_length, in_table, list_structure, list_numbers = state
            del self.fed[fed_length:]
            self.in_table = in_table
            self.list_structure = list_structure
            self.list_numbers = list_numbers

        self.last_tag = None
        self._last_tag_state = None

    def handle_data(self, d):
        """
        Called for text between tags; the text is kept unchanged.
        """
        self.last_tag = None
        self.fed.append(d)

    def handle_entityref(self, name):
        """
        Called for named character references such as &amp; when the
        parser does not decode them itself.

        Appends the referenced character, or the reference unchanged if
        the name is unknown.
        """
        self.last_tag = None

        try:
            from htmlentitydefs import name2codepoint # Python 2
        except ImportError:
            from html.entities import name2codepoint # Python 3

        codepoint = name2codepoint.get(name)
        if codepoint is None:
            self.fed.append(u"&{0};".format(name))
        else:
            self.fed.append(self._to_char(codepoint))

    def handle_charref(self, name):
        """
        Called for numeric character references such as &#38; or &#x26;
        when the parser does not decode them itself.

        Appends the referenced character, or the reference unchanged if
        the number cannot be converted.
        """
        self.last_tag = None

        try:
            if name[:1] in (u"x", u"X"):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            self.fed.append(self._to_char(codepoint))
        except (ValueError, OverflowError):
            self.fed.append(u"&#{0};".format(name))

    @staticmethod
    def _to_char(codepoint):
        """
        Returns the (unicode) character for codepoint.
        """
        try:
            return unichr(codepoint)
        except NameError:
            # No unichr(): chr() already returns unicode text
            return chr(codepoint)

    def get_data(self):
        """
        Returns the converted text.

        If close_open_tables is set, tables that are still open are
        closed first.
        """
        if close_open_tables:
            while self.in_table > 0:
                self.fed.append(format_table[table_start][1])
                self.in_table -= 1
        return u''.join(self.fed)

    def handle_table_tag(self, tag):
        """
        Adds the opening markup for a tag listed in format_table.
        """

        # Captions need to be inserted before the table start
        if tag ==  u"caption":
            # Caption should always be immediately after table start
            # so we can just delete the last item, add the caption
            # and start the table again on its close tag
            del self.fed[-1]
            return

        # Special case is needed for td
        if tag == u"td" or tag == u"th":
            if not self.new_table_row:
                # need spaces or blank cell will be ignored
                self.fed.append(u" {0} ".format(format_table[tag][0]))
                return
            else:
                # First cell of the row: no separator in front of it
                self.new_table_row = False
                return
        else:
            self.fed.append(format_table[tag][0])

    def handle_image(self, attrs):
        """
        Adds the image url (its src attribute), followed by a size
        appendix if maintain_image_resizes is set and the tag carries
        both a width and a height.

        Does nothing unless add_image_src is set.
        """
        if not add_image_src:
            return

        appendix = ""
        if maintain_image_resizes:
            width = self.get_attribute(attrs, "width", False)
            height = self.get_attribute(attrs, "height", False)

            # Currently only deals with img for which both width and height are
            # specified
            if width and height:
                size_type = "s" # default size in pixels

                # HTML can handle images with 1 dimension in %
                # and the other in pixel but wikidpad (default
                # parser at least) cannot.
                if width[-1] == "%":
                    size_type = "r"

                appendix = "".join([wikipad_url_appendix_delimiter, size_type,
                                width, "x", height, " "])

        self.fed.append("".join([self.get_attribute(attrs, "src"), appendix]))

    def get_attribute(self, attrs, attr, not_found=""):
        """
        Loops through all attributes returning the requested one
        if found.

        attrs is a list of tuples, quotations (") are not included
        [(name, value), (name2, value2), ...]

        not_found is returned if the attribute is missing or has no
        value (the parser reports <tag attr> as (attr, None)).
        """

        for name, value in attrs:
            if name == attr and value is not None:
                return value
        return not_found


def strip_tags(html):
    """
    Runs html through a fresh HTMLStripper and returns the converted text.
    """
    stripper = HTMLStripper()
    stripper.feed(html)
    converted = stripper.get_data()
    return converted

def getData(d):
    """
    Converts the html text d into wikidpad markup and returns it.

    Steps:
      1. if pre_blocks_maintain_formating is set, the content of every
         pre block is swapped for a placeholder
      2. line breaks are removed and remaining whitespace is collapsed
      3. tags are converted by strip_tags()
      4. the pre blocks are put back, wrapped in pre_block_tags
      5. the custom replaces are applied (if enabled)

    Leading whitespace is removed from the result.
    """

    pre_blocks = []

    if pre_blocks_maintain_formating:
        # Pre blocks have to be taken out before any whitespace (line
        # breaks included) is touched, otherwise there is no formatting
        # left to maintain.
        def stash_pre_block(match):
            pre_blocks.append(match.group(1))
            return "$_PREBLOCK-{0}_$".format(len(pre_blocks) - 1)

        d = re.sub(r"<pre(?:>| [\s\S]*?>)([\s\S]*?)</pre(?:>| [\s\S]*?>)",
                   stash_pre_block, d)

    # Remove line breaks?
    d = d.replace("\n", "")

    # Remove whitespace (ignored in html anyway)
    d = " ".join(d.split())

    d = strip_tags(d)

    # Clean up some spaces
    d = d.replace("     ", "    ")

    # Add pre blocks back in. Plain concatenation (rather than
    # str.format) so that a unicode block is not forced through an
    # ascii encode on Python 2.
    for i, block in enumerate(pre_blocks):
        d = d.replace("$_PREBLOCK-{0}_$".format(i),
                      pre_block_tags[0] + block + pre_block_tags[1])

    # Perform custom replaces (if enabled)
    if enable_custom_replace:
        for a, b in custom_replace:
            d = d.replace(a, b)

        for a, b in custom_replace_regex:
            d = re.sub(a, b, d)

    return d.lstrip()


def Paste(pwiki, evt):
    """
    Menu callback: pastes the clipboard content into the active editor.

    If the clipboard offers "text/html" it is converted with getData()
    first; otherwise, if it offers "TEXT", that is pasted unchanged.
    Nothing happens if neither is available or the content cannot be
    retrieved.

    pwiki must provide getActiveEditor(); evt is not used.
    """

    editor = pwiki.getActiveEditor()

    clipboard = gtk.Clipboard()

    # Guard against wait_for_targets() returning None (presumably the
    # case for an empty clipboard -- see the PyGTK reference); "in"
    # would raise on None.
    targets = clipboard.wait_for_targets() or ()

    if "text/html" in targets:
        contents = clipboard.wait_for_contents("text/html")
        if contents:

            # Firefox data needs to be formatted first
            if "text/_moz_htmlinfo" in targets:
                d = contents.data.decode('utf_16').replace('\x00', '').strip()
            else:
                d = contents.data.strip()

            text = getData(d)

            editor.ReplaceSelection(text)

    elif "TEXT" in targets:
        text_contents = clipboard.wait_for_contents("TEXT")

        # Same as for html: the content may not be retrievable
        if text_contents is not None and text_contents.data is not None:
            editor.ReplaceSelection(text_contents.data)


Attachments