| Version 3 (modified by xkjq, 2 years ago) |
|---|
!PasteHTML
Adds the ability to paste formatted text from programs that support copying as HTML.
Notes
- The script is set up for the default WikidPad parser (though it is customizable if you use a custom parser or you don't like the default settings).
- It's still early days, so some things may not be included or may be handled incorrectly. If so, let me know and I'll look at fixing it.
- I've tested this mainly with content from Wikipedia; whilst it should work with any (valid) HTML, your results may vary.
Requirements
- gtk
Installation
As usual, save the code to a file in your user_extensions folder.
Code
########################################################################
#
# PasteHtml.py
# ------------
# HTML parser for Wikidpad
#
# v0.1 by xkjq@ymail.com
#
# A plugin that retrieves HTML text from the clipboard and converts it
# to a wikidpad compatible format.
#
# Unfortunately wxPython's clipboard support appears to be a bit lacking
# (or I'm just unable to work it out) so gtk is required. If someone
# knows of a cross-platform way to get html text in wxPython (or can
# provide the DataObjects required) please let me know or feel free
# to modify the script yourself.
#
# The script is customisable so it shouldn't be too hard to modify it
# for custom parsers. Beware: it's still early days and stuff is likely
# to change in future versions.
#
# Note: Formatting can only be retrieved if it is defined by html tags;
# CSS formatting will be ignored.
#
# TODO: Clean up the code and maybe even give the script a nice GUI
#
########################################################################
#
# The following settings define how the text is formated. If the
# defaults are not to your liking they can be changed here.
#
# If you use a custom parser chances are you *will* need to change this.
#
#
# START CONFIGURATION
#
# Some html tags have no equivalent in the default wikiParser language.
# If (like me) you haven't got round to adding them to your parser, but
# still want to keep the formatting, add them below. Tags listed here are
# written to the output unchanged (as <tag> ... </tag>).
tags_to_keep = set(["u", "sup", "sub"])
#
# tags_to_sub defines which html tags should be converted into wikiparser
# format and what their replacements should be
#
# html_tag : ("wikiParser_start_tag", "wikiParser_end_tag")
#
# e.g. u"b" : (u"*", u"*")
# would cause <b>Hello World!</b>
# to become *Hello World!*
#
tags_to_sub = {
    u"b" : (u"*", u"*"),
    u"strong" : (u"*", u"*"),
    u"i" : (u"_", u"_"),
    u"em" : (u"_", u"_"),
    u"p" : (u"\n", u"\n"),
    u"h1" : (u"\n++", u"\n"),
    u"h2" : (u"\n+++", u"\n"),
    u"h3" : (u"\n++++", u"\n"),
    u"h4" : (u"\n+++++", u"\n"),
    u"h5" : (u"\n++++++", u"\n"),
    u"h6" : (u"\n+++++++", u"\n"),
    u"h7" : (u"\n++++++++", u"\n"),
    u"h8" : (u"\n+++++++++", u"\n"),
    u"br" : (u"\n", u""),
    # Links
    u"a" : (u"[", u"]")
}
#
# This defines how tables should be formatted. If we are in a table
# the script will check for tags in here prior to those defined
# above.
format_table = {
    u"table": (u"\n<<|\n", u">>\n"),
    u"tr": (u"", u"\n"),
    u"td": (u"|", u""),
    # th is used in the header row. Currently wikidpad (the default parser
    # at least) does not distinguish the table header.
    u"th": (u"|", u""),
    # Text in caption will be inserted just above the table
    u"caption": (u"", u""),
    # Previously defined tags can be overridden whilst
    # inside tables, i.e. br should become "\newline"
    # instead of just "newline" for the default parser
    u"br" : (u"\\\n", u""),
}
#
# Setting the variable below to True would cause table formatting to be
# ignored if no table start tag has been encountered.
ignore_table_formatting_when_not_in_table = False
# If no table is open, a table will be opened on any tag present in
# table_start_tags (defined below).
open_tables_if_needed = True # takes precedence over
# ignore_table_formatting_when_not_in_table
# Only these tags can start a table automatically
table_start_tags = set([u"tr", u"td", u"th", u"caption"])
close_open_tables = True # Useful when only pasting part of a table
# Defines which entry of format_table is used when opening a
# table. Probably best to leave this as default
table_start = u"table"
# If True, tags which have no associated text and are not listed in
# allowed_empty_tags will be ignored. Can solve some issues but may
# create others.
# e.g. <h2></h2> will be ignored
# <h2>TEST</h2> would not
ignore_empty_tags = True
allowed_empty_tags = set(["img", "p", "br"]) # Should any others be in here?
# List tags defined below will be converted to wikidpad format.
# The unordered list start tag (default: *) can be customized, ordered
# lists cannot.
lists_format = {
    "ul" : "*",
    "ol" : "ordered" # Not customisable
}
#
#
# spacer defines the separator to be used for lists; it is repeated once
# per nesting level in front of each list item.
# e.g. use
# spacer = "\t"
# if you want to use tab indented lists
spacer = " "
#
# If allow_extra_ol_formats is False all ordered lists will be converted
# to numbered lists. If True the list type will be maintained (I don't
# think the default parser supports this at the moment). This includes
# alphabetical and roman numeral based lists
allow_extra_ol_formats = True
#
#
# If True, images will be replaced by their src attribute (i.e. their
# url). If False images are ignored. As such images should function
# correctly in preview mode (providing the link is absolute not
# relative?).
# TODO: Add option to save image locally
add_image_src = True
# If set, the width/height attributes of an image are appended to its url
maintain_image_resizes = True
# Is this customisable?? If so what about s/r/a etc?
wikipad_url_appendix_delimiter = ">"
#
# URL handling
# If True all anchors will be converted to wikidpad format
maintain_links = False
wikipad_link_delimiter = "|"
#
#
# pre_blocks_maintain_formating: if set to True, formatting within pre tags
# will be maintained and surrounded with the tags defined in pre_block_tags.
# TODO: make a few changes to this so it's actually useful
pre_blocks_maintain_formating = False
pre_block_tags = ("<<pre\n", "\n>>")
#
# Custom replaces
# ---------------
# These are custom rules to be run on the text. Can be handy to remove
# unwanted formatting (from sites such as wikipedia etc.) automatically.
#
# custom_replace entries are applied with the standard python
# replace(a, b) method, where all occurrences of a are replaced by b.
# custom_replace_regex entries are applied with re.sub.
enable_custom_replace = True
custom_replace = [
    ("[edit]", ""), # Remove some unwanted wikipedia formatting
    ("<sup>[_citation needed_]</sup>", ""),
]
custom_replace_regex = [
    (r"<sup>(\[\d*\])?</sup>", ""), # Remove wikipedia refs, e.g. [2]
]
#
#
# END SCRIPT CONFIGURATION
# You shouldn't (!) have to edit anything below here
########################################################################
import sys, wx, re
from HTMLParser import HTMLParser
# gtk is an optional dependency: the plugin only registers itself with
# WikidPad when the import below succeeds.
gkt_imported = False
try:
    import gtk, gobject
    gkt_imported = True
except Exception:
    # Deliberately broad (importing gtk can fail in ways other than
    # ImportError), but no longer a bare ``except:`` so KeyboardInterrupt
    # and SystemExit are not swallowed.
    # Parenthesised so the line is valid on Python 2 and Python 3 alike.
    print("PasteHTML Error - Unable to import GTK")
if gkt_imported:
    # Plugin descriptor read by WikidPad; only defined when gtk is usable.
    WIKIDPAD_PLUGIN = (("MenuFunctions", 1),)

    def describeMenuItems(wiki):
        """
        Return the menu entries provided by this plugin.

        The result is a one-element tuple holding (callback, menu label
        including the Ctrl-Shift-V shortcut, description text). The wiki
        argument is not used.
        """
        # The former "global nextnumber" declaration was removed: the name
        # is never defined or used anywhere in this module.
        return ((Paste, "Paste HTML\tCtrl-Shift-V", "Pastes HTML in a wikipad compatable format"),)
# Custom lists conversion
# Value/symbol pairs in descending order of value. Kept as a tuple of
# pairs rather than a zip() result: on Python 3 zip() is a one-shot
# iterator, which would leave the table empty after the first conversion.
numeral_map = (
    (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
    (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
    (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
)


def intToRoman(i):
    """
    Converts integer to roman numeral

    i is expected to be a positive integer; 0 yields an empty string.
    Returns the numeral in upper case.
    """
    result = []
    for integer, numeral in numeral_map:
        # divmod gives an integer quotient on both Python 2 and 3 and
        # leaves the remainder for the smaller symbols.
        count, i = divmod(i, integer)
        result.append(numeral * count)
    return ''.join(result)
# Index 0 is a placeholder so that letters[1] == "A" ... letters[26] == "Z".
letters = "0ABCDEFGHIJKLMNOPQRSTUVWXYZ"


def intToLetter(i):
    """
    Converts integer to letter format

    1 -> "A", 26 -> "Z", 27 -> "AA", 52 -> "AZ", 702 -> "ZZ", 703 -> "AAA"
    and so on. Values below 1 return the placeholder letters[0].
    """
    if i < 1:
        return letters[0]
    digits = []
    while i > 0:
        # List lettering has no zero digit, so shift by one before each
        # division. The previous // and % arithmetic produced the "0"
        # placeholder for multiples of 26 (52 -> "B0") and ran off the end
        # of the table at 702.
        i, remainder = divmod(i - 1, 26)
        digits.append(letters[remainder + 1])
    return "".join(reversed(digits))
class HTMLStripper(HTMLParser):
    """
    Converts HTML into wikidpad markup.

    Tags listed in tags_to_keep are copied through unchanged, tags in
    tags_to_sub / format_table / lists_format are replaced by their
    configured markup and every other tag is dropped. The converted
    pieces are collected in self.fed and joined by get_data().
    """

    def __init__(self):
        # Use the base initialiser rather than a bare reset() so the
        # parser is fully set up on every Python version.
        HTMLParser.__init__(self)
        self.fed = []
        self.spacer = spacer
        self.tags_to_keep = tags_to_keep
        self.tags_to_sub = tags_to_sub
        self.lists_format = lists_format
        # Parallel stacks describing the currently open lists:
        # (list marker, html "type" attribute) and the item counter.
        self.list_structure = []
        self.list_numbers = []
        # Is it realistic to handle tables within tables?
        self.in_table = 0
        self.new_table_row = True
        # Name of the most recent start tag; reset to None as soon as any
        # text is seen, so tag == last_tag in handle_endtag means "empty".
        self.last_tag = None
        # len(self.fed) just before that start tag wrote its output.
        self._last_tag_mark = 0
        # True while a caption has taken the table start marker out of
        # self.fed and it still has to be re-added on </caption>.
        self._caption_pending = False

    def handle_starttag(self, tag, attrs):
        """
        Called whenever an opening tag is reached.
        Generally we add the wikidpad equivalent to the
        list self.fed
        """
        self.last_tag = tag
        self._last_tag_mark = len(self.fed)

        if tag == u"img":
            self.handle_image(attrs)
            return

        # If we're in a table check table formatting first
        if tag in format_table:
            if tag == table_start:
                self.in_table += 1
            if tag == u"tr":
                self.new_table_row = True
            if self.in_table >= 1:
                # Table formatting overrides the generic replacements, so
                # stop here; falling through used to emit both the table
                # markup and the tags_to_sub markup for <br>.
                self.handle_table_tag(tag)
                return
            # Table not started
            if open_tables_if_needed and tag in table_start_tags:
                self.fed.append(format_table[table_start][0])
                self.in_table = 1
                self.handle_table_tag(tag)
                return
            if ignore_table_formatting_when_not_in_table:
                return

        if tag in self.tags_to_keep:
            self.fed.append(u"<{0}>".format(tag))
            return

        if tag in self.tags_to_sub:
            # Anchors are only written out when maintain_links is set
            if tag == u"a":
                if maintain_links:
                    link = self.get_attribute(attrs, u"href", u"Unable to find link")
                    self.fed.append("".join([self.tags_to_sub[tag][0], link, wikipad_link_delimiter]))
                return
            self.fed.append(self.tags_to_sub[tag][0])

        # Handle lists
        if tag in self.lists_format:
            list_type = self.get_attribute(attrs, u"type")
            self.list_structure.append((self.lists_format[tag], list_type))
            self.list_numbers.append(0)

        if tag == u"li" and self.list_structure:
            list_item_number = self.list_numbers[-1] + 1
            self.list_numbers[-1] = list_item_number
            list_start, list_type = self.list_structure[-1]
            if list_start == u"ordered":  # Ordered list
                n = list_item_number
                if allow_extra_ol_formats:
                    if list_type == u"A":  # Capital letters
                        n = intToLetter(list_item_number)
                    elif list_type == u"a":  # Lower letters
                        n = intToLetter(list_item_number).lower()
                    elif list_type == u"I":  # Roman numerals
                        n = intToRoman(list_item_number)
                    elif list_type == u"i":  # Lower roman numerals
                        n = intToRoman(list_item_number).lower()
                list_start = u"{0}.".format(n)
            self.fed.append(u"{0}{1} ".format(self.spacer * len(self.list_structure), list_start))

    def handle_startendtag(self, tag, attrs):
        """
        Self closing tags are handled here
        e.g. <img src="...." />
        <br />
        For now just redirects to handle_starttag()
        Are there any situations in which this shouldn't
        happen?
        """
        self.handle_starttag(tag, attrs)

    def handle_endtag(self, tag):
        """
        Called as the tag is closed
        """
        # Table tags are never treated as removable empty tags: a blank
        # cell must keep its delimiter or the columns shift, and the
        # table/caption bookkeeping below must always run.
        if (ignore_empty_tags and tag == self.last_tag
                and tag not in allowed_empty_tags
                and tag not in format_table):
            # Drop exactly what the start tag wrote. The old
            # "del self.fed[-1]" removed unrelated output (or raised
            # IndexError) when the start tag had written nothing.
            del self.fed[self._last_tag_mark:]
            self.last_tag = None
            if tag in self.lists_format:
                self._close_list()
            return

        if tag in format_table:
            if tag == u"caption":
                # Re-open the table only if the caption start actually
                # removed the table start marker.
                if self._caption_pending:
                    self.fed.append(format_table[table_start][0])
                    self._caption_pending = False
                return
            if tag == table_start and self.in_table > 0:
                # Guarded so a stray closing tag cannot push the counter
                # below zero and break later tables.
                self.in_table -= 1
            self.fed.append(format_table[tag][1])

        if tag == u"a":
            if maintain_links and tag in self.tags_to_sub:
                self.fed.append(self.tags_to_sub[tag][1])
            return

        if tag in self.tags_to_keep:
            self.fed.append(u"</{0}>".format(tag))
            return

        if tag in self.tags_to_sub:
            self.fed.append(self.tags_to_sub[tag][1])

        if tag == u"li" and self.list_structure:
            self.fed.append(u"\n")

        if tag in self.lists_format:
            self._close_list()

    def _close_list(self):
        """Pop the innermost list from both list stacks (no-op if none is open)."""
        # Both stacks are popped together; previously only list_structure
        # was popped, so items following a nested list continued the
        # nested list's numbering.
        if self.list_structure:
            del self.list_structure[-1]
        if self.list_numbers:
            del self.list_numbers[-1]

    def handle_data(self, d):
        """Store text content; any text marks the current tag as non-empty."""
        self.last_tag = None
        self.fed.append(d)

    def get_data(self):
        """
        Return the converted text, closing any tables that are still
        open first when close_open_tables is set.
        """
        if close_open_tables:
            while self.in_table > 0:
                self.fed.append(format_table[table_start][1])
                self.in_table -= 1
        return u''.join(self.fed)

    def handle_table_tag(self, tag):
        """Write the opening markup for a tag found in format_table."""
        # Captions need to be inserted before the table start
        if tag == u"caption":
            # The caption normally follows the table start directly, so the
            # start marker is removed here and added again on </caption>.
            # Only the marker itself is removed (skipping whitespace-only
            # text in between); other output is left untouched.
            start = format_table[table_start][0]
            idx = len(self.fed) - 1
            while idx >= 0 and self.fed[idx] != start and not self.fed[idx].strip():
                idx -= 1
            if idx >= 0 and self.fed[idx] == start:
                del self.fed[idx]
                self._caption_pending = True
            return

        # Special case is needed for td / th
        if tag == u"td" or tag == u"th":
            if self.new_table_row:
                # First cell of a row gets no leading delimiter
                self.new_table_row = False
            else:
                # need spaces or blank cell will be ignored
                self.fed.append(u" {0} ".format(format_table[tag][0]))
            return

        self.fed.append(format_table[tag][0])

    def handle_image(self, attrs):
        """
        Write the image url, followed by a size appendix when both
        width and height are given. Does nothing unless add_image_src is set.
        """
        if not add_image_src:
            return
        appendix = ""
        if maintain_image_resizes:
            width = self.get_attribute(attrs, "width", False)
            height = self.get_attribute(attrs, "height", False)
            # Currently only deals with img for which both width and height are
            # specified
            if width and height:
                size_type = "s"  # default size in pixels
                # HTML can handle images with 1 dimension in %
                # and the other in pixel but wikidpad (default
                # parser at least) cannot.
                if width[-1] == "%":
                    size_type = "r"
                appendix = "".join([wikipad_url_appendix_delimiter, size_type,
                                    width, "x", height, " "])
        # A valueless src attribute is reported as None by the parser
        src = self.get_attribute(attrs, "src") or ""
        self.fed.append("".join([src, appendix]))

    def get_attribute(self, attrs, attr, not_found=""):
        """
        Loops through all attributes returning the requested one
        if found, otherwise not_found.
        attrs is a list of tuples, quotations (") are not included
        [(name, value), (name2, value2), ...]
        """
        for name, value in attrs:
            if name == attr:
                return value
        return not_found
def strip_tags(html):
    """
    Run html through HTMLStripper and return the converted text.
    """
    s = HTMLStripper()
    s.feed(html)
    # close() forces the parser to process anything it is still
    # buffering (e.g. an unfinished construct at the very end of the
    # input); without it that trailing text was silently lost.
    s.close()
    return s.get_data()
def getData(d):
    """
    Convert a piece of HTML into wikidpad text.

    d is the HTML text taken from the clipboard. Steps: optionally set
    aside <pre> blocks, remove line breaks and collapse whitespace,
    convert the tags with strip_tags(), put the pre blocks back and
    finally apply the custom replace rules. Returns the converted text
    with leading whitespace removed.
    """
    pre_blocks = []
    if pre_blocks_maintain_formating:
        # Pre blocks have to be set aside before ANY whitespace is removed.
        # Previously the newlines were stripped first, which flattened the
        # very formatting this option is meant to keep.
        # A single pattern is used for every block; the old follow-up
        # pattern required at least one character and could therefore
        # match from an empty <pre></pre> up to a later </pre>.
        pre_pattern = re.compile(
            r"<pre(?:>| [\s\S]*?>)([\s\S]*?)</pre(?:>| [\s\S]*?>)")

        def stash(match):
            # Remember the block content and leave a numbered placeholder
            pre_blocks.append(match.group(1))
            return "$_PREBLOCK-{0}_$".format(len(pre_blocks) - 1)

        d = pre_pattern.sub(stash, d)

    # Remove line breaks?
    d = d.replace("\n", "")
    # Remove whitespace (ignored in html anyway)
    d = " ".join(d.split())
    d = strip_tags(d)
    # Clean up some spaces
    # NOTE(review): as written this replaces a space with a space; the
    # original characters may have been lost when the code was pasted to
    # the wiki page - confirm against the attached PasteHTML.py.
    d = d.replace(" ", " ")

    # Add pre blocks back in. Joined rather than str.format()-ed so that
    # unicode block content cannot trigger an implicit ascii encode on
    # Python 2.
    for i, block in enumerate(pre_blocks):
        d = d.replace("$_PREBLOCK-{0}_$".format(i),
                      "".join([pre_block_tags[0], block, pre_block_tags[1]]))

    # Perform custom replaces (if enabled)
    if enable_custom_replace:
        for a, b in custom_replace:
            d = d.replace(a, b)
        for a, b in custom_replace_regex:
            d = re.sub(a, b, d)
    return d.lstrip()
def Paste(pwiki, evt):
    """
    Menu callback: paste the clipboard content into the active editor.

    If the clipboard offers "text/html" it is converted with getData()
    first; otherwise a "TEXT" target is inserted unchanged. Nothing is
    inserted when the clipboard offers neither. evt is not used.
    """
    editor = pwiki.getActiveEditor()
    clipboard = gtk.Clipboard()
    # wait_for_targets() yields None when nothing is available (e.g. an
    # empty clipboard); treat that as "no targets" instead of failing on
    # the membership tests below.
    targets = clipboard.wait_for_targets() or ()
    if "text/html" in targets:
        contents = clipboard.wait_for_contents("text/html")
        if contents:
            # Firefox data needs to be formatted first
            if "text/_moz_htmlinfo" in targets:
                d = contents.data.decode('utf_16').replace('\x00', '').strip()
            else:
                d = contents.data.strip()
            text = getData(d)
            editor.ReplaceSelection(text)
    elif "TEXT" in targets:
        text_contents = clipboard.wait_for_contents("TEXT")
        # The content request can fail even though the target was listed
        if text_contents:
            editor.ReplaceSelection(text_contents.data)
Attachments
-
PasteHTML.py
(26.6 kB) - added by xkjq
14 months ago.
PasteHTML v0.3
