415 lines
10 KiB
Perl
415 lines
10 KiB
Perl
package markup;
|
|
|
|
use strict;
|
|
use warnings;
|
|
no warnings 'redefine';
|
|
|
|
use Data::Dumper;
|
|
use Text::WikiCreole();
|
|
use HTML::Parse();
|
|
use HTML::FormatText();
|
|
use Encode();
|
|
use HTML::Entities();
|
|
use Text::Markdown();
|
|
|
|
use log();
|
|
|
|
our @EXPORT_OK =
|
|
qw(fix_line_ends html_to_creole creole_to_html creole_to_plain plain_to_ical ical_to_plain ical_to_xml html_to_plain fix_utf8 uri_encode compress base26);
|
|
|
|
sub fix_line_ends ($) {
|
|
my ($s) = @_;
|
|
$s =~ s/\r?\n|\r/\n/g;
|
|
return $s;
|
|
}
|
|
|
|
# convert 1..26 to a..z, 27 to aa, inspired by ConvertAA
|
|
sub base26($) {
|
|
my ($num) = @_;
|
|
return '' if $num <= 0;
|
|
|
|
my $s = "";
|
|
while ($num) {
|
|
$s = chr( --$num % 26 + ord "a" ) . $s;
|
|
$num = int $num / 26;
|
|
}
|
|
|
|
return $s;
|
|
}
|
|
|
|
sub html_to_creole($) {
|
|
my ($s) = @_;
|
|
|
|
#remove elements
|
|
$s =~ s/\<\!\-\-[\s\S]*?\-\-\>//gi;
|
|
$s =~ s/<script.*?>.*?<\/script.*?>//gi;
|
|
$s =~ s/<\/?form.*?>//gi;
|
|
$s =~ s/<\/?select.*?>//gi;
|
|
$s =~ s/<\/?option.*?//gi;
|
|
$s =~ s/<\/?input.*?>//gi;
|
|
$s =~ s/<\/?script.*?>//gi;
|
|
|
|
#remove line breaks
|
|
$s =~ s/[\r\n]+/ /gi;
|
|
|
|
#formats
|
|
$s =~ s/<img.*?src="(.*?)".*?>/{{$1\|}}/gi;
|
|
$s =~ s/<img.*?title="(.*?)".*?>/{{$2\|$1}}/gi;
|
|
$s =~ s/<img.*?src="(.*?)"[^>]*?title="(.*?)".*?>/{{$1\|$2}}/gi;
|
|
$s =~ s/<img.*?title="(.*?)"[^>]*?src="(.*?)".*?>/{{$2\|$1}}/gi;
|
|
$s =~ s/<\/?img.*?>//gi;
|
|
|
|
#replace line breaks from images
|
|
$s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
|
$s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
|
$s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
|
|
|
$s =~ s/<i.*?>(.*?)<\/i>/\/\/$1\/\//gi;
|
|
$s =~ s/<\/?i.*?>//gi;
|
|
$s =~ s/<b.*?>(.*?)<\/b>/\*\*$1\*\*/gi;
|
|
|
|
$s =~ s/<strong.*?>(.*?)<\/strong>/\*\*$1\*\*/gi;
|
|
$s =~ s/<em.*?>(.*?)<\/em>/\/\/$1\/\//gi;
|
|
$s =~ s/<blockquote.*?>((\W+|\w+)*?)<\/blockquote>/{{{$1}}}/gi;
|
|
|
|
$s =~ s/<a\s+.*?href="(.*?)".*?>(.*?)(\s*)<\/a>/\[\[$1\|$2\]\]$3/gi;
|
|
$s =~ s/<a.*?>//gi;
|
|
|
|
#replace line breaks from links
|
|
$s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
|
$s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
|
$s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
|
|
|
$s =~ s/[\s]+/ /gi;
|
|
|
|
#line elements, increase head line level to avoid breaking single = chars
|
|
$s =~ s/\s*<h1.*?>/== /gi;
|
|
$s =~ s/\s*<h2.*?>/=== /gi;
|
|
$s =~ s/\s*<h3.*?>/==== /gi;
|
|
$s =~ s/\s*<h\d.*?>/===== /gi;
|
|
|
|
my $tree = HTML::Parse::parse_html( '<body>' . $s . '</body>' );
|
|
my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 2000 );
|
|
$s = $formatter->format($tree);
|
|
|
|
$s =~ s/\</\</g;
|
|
|
|
#fix line endings
|
|
$s =~ s/\n[ \t]+/\n/gi;
|
|
|
|
$s =~ s/\n{3,99}/\n\n/g;
|
|
$s =~ s/\n*\*[\s]+/\n\* /g;
|
|
|
|
#enter line break before headlines
|
|
$s =~ s/(={2,99})/\n$1/g;
|
|
|
|
#reduce head line level
|
|
$s =~ s/=(=+)/$1/g;
|
|
|
|
$s =~ s/^\s+//gi;
|
|
$s =~ s/\s+$//gi;
|
|
$s =~ s/\n{3,99}/\n\n/g;
|
|
|
|
$s =~ s/\n/\\\\\n/g;
|
|
$s =~ s/\\\\\n\=/\n\=/g;
|
|
|
|
return $s;
|
|
}
|
|
|
|
sub creole_to_html ($) {
|
|
my $s = $_[0] || '';
|
|
|
|
$s =~ s/<a\s+.*?href="(.*?)".*?>(.*?)(\s*)<\/a>/\[\[$1\|$2\]\]$3/gi;
|
|
$s =~ s/<a.*?>//gi;
|
|
|
|
$s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
|
$s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
|
$s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
|
$s =~ s/^\s+//g;
|
|
$s =~ s/\s+$//g;
|
|
|
|
$s = Text::WikiCreole::creole_parse($s) || '';
|
|
|
|
#replace line breaks from images
|
|
$s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
|
$s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
|
$s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
|
|
|
#remove whitespaces and break lines at start or end of elements
|
|
for my $elem ( 'p', 'li' ) {
|
|
$s =~ s|<$elem>\s*<br/><br/>|<$elem>|g;
|
|
$s =~ s|<br/><br/>\s*</$elem>|</$elem>|g;
|
|
}
|
|
|
|
return $s;
|
|
}
|
|
|
|
sub markdown_to_html($){
|
|
my $text = $_[0] // '';
|
|
my $html = Text::Markdown::markdown($text);
|
|
$html=~s!(\s)(https?://[^\s]+)(\s)!$1<a href="$2">$2</a>$3!g;
|
|
return $html;
|
|
}
|
|
|
|
sub creole_to_plain($) {
|
|
my ($s) = @_;
|
|
|
|
$s =~ s/\<p\>/\n/gi;
|
|
$s =~ s/\{\{\{((\W+|\w+)+?)\}\}\}/<blockquote>$1<\/blockquote>/g;
|
|
$s =~ s/\{\{(.+?)\|(.*?)\}\}//g;
|
|
$s =~ s/\[\[(.+?)\|(.*?)\]\]/$2/g;
|
|
$s =~ s/\/\/([^\/\/]*?)\/\//<em>$1<\/em> /g;
|
|
$s =~ s/\n=== (.*?)\n/\n<h3>$1<\/h3>\n/g;
|
|
$s =~ s/\n== (.*?)\n/\n<h2>$1<\/h2>\n/g;
|
|
$s =~ s/\*\*(.*?)\*\*/<strong>$1<\/strong> /g;
|
|
$s =~ s/^== (.*?)\n/<h2>$1<\/h2>\n/g;
|
|
$s =~ s/\n\* (.*?)\n/\n<li>$1<\/li>\n/g;
|
|
$s =~ s/\n\* (.*?)\n/\n<li>$1<\/li>\n/g;
|
|
$s =~ s/\n\- (.*?)\n/\n<lo>$1<\/lo>\n/g;
|
|
$s =~ s/\n\- (.*?)\n/\n<lo>$1<\/lo>\n/g;
|
|
$s =~ s/\n\n/\n<p>/gi;
|
|
$s =~ s/\n/\n<br\/>/gi;
|
|
return $s;
|
|
}
|
|
|
|
sub html_to_plain ($) {
|
|
my ($s) = @_;
|
|
|
|
return '' unless defined $s;
|
|
my $tree = HTML::Parse::parse_html( '<body>' . $s . '</body>' );
|
|
my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 2000 );
|
|
$s = $formatter->format($tree);
|
|
return $s;
|
|
}
|
|
|
|
sub ical_to_plain ($) {
|
|
return '' unless defined( $_[0] );
|
|
$_[0] =~ s/\\n/\n/gi;
|
|
$_[0] =~ s/ /\t/gi;
|
|
$_[0] =~ s/\\\./\./gi;
|
|
$_[0] =~ s/\\\,/\,/gi;
|
|
$_[0] =~ s/\\\\/\\/gi;
|
|
return $_[0];
|
|
}
|
|
|
|
sub plain_to_ical ($) {
|
|
return '' unless defined( $_[0] );
|
|
|
|
#remove images + links
|
|
$_[0] =~ s/\[\[.+?\|(.+?)\]\]/$1/g;
|
|
$_[0] =~ s/\{\{.+?\}\}//g;
|
|
$_[0] =~ s/^\s+//g;
|
|
$_[0] =~ s/\\/\\\\/gi;
|
|
$_[0] =~ s/\,/\\\,/gi;
|
|
|
|
# $_[0]=~s/\./\\\./gi;
|
|
$_[0] =~ s/[\r\n]/\\n/gi;
|
|
$_[0] =~ s/\t/ /gi;
|
|
return $_[0];
|
|
}
|
|
|
|
sub plain_to_xml($) {
|
|
return '' unless defined( $_[0] );
|
|
$_[0] =~ s/\n\={1,6} (.*?)\s+/\n\[\[$1\]\]\n/gi;
|
|
|
|
#remove images + links
|
|
$_[0] =~ s/\[\[.+?\|(.+?)\]\]/$1/g;
|
|
$_[0] =~ s/\{\{.+?\}\}//g;
|
|
return encode_xml_element( $_[0] );
|
|
}
|
|
|
|
sub fix_utf8($) {
|
|
$_[0] = Encode::decode( 'cp1252', $_[0] );
|
|
return $_[0];
|
|
}
|
|
|
|
sub uri_encode ($) {
|
|
$_[0] =~ s/([^a-zA-Z0-9_\.\-])/sprintf("%%%02lx",ord($1))/esg;
|
|
return $_[0];
|
|
}
|
|
|
|
sub compress ($) {
|
|
my $header = '';
|
|
|
|
if ( $_[0] =~ /(Content\-type\:[^\n]+[\n]+)/ ) {
|
|
$header = $1;
|
|
}
|
|
my $start = index( $_[0], $header );
|
|
return if ( $start < 0 );
|
|
|
|
my $header_length = length($header);
|
|
$header = substr( $_[0], 0, $start + $header_length );
|
|
|
|
my $content = substr( $_[0], $start + $header_length );
|
|
|
|
#remove multiple line breaks
|
|
$content =~ s/[\r\n]+[\s]*[\r\n]+/\n/g;
|
|
|
|
#remove leading whitespaces
|
|
$content =~ s/[\r\n]+[\s]+/\n/g;
|
|
|
|
#remove tailing whitespaces
|
|
$content =~ s/[\t ]*[\r\n]+/\n/g;
|
|
|
|
#remove whitespaces inside tags
|
|
$content =~ s/([\n]\<[^\n]+)[\r\n]+/$1 /g;
|
|
$content =~ s/\"\s+\>/\"\>/g;
|
|
|
|
#get closing tags closer
|
|
$content =~ s/[\r\n]+(\<[\/\!])/$1/g;
|
|
$content =~ s/(\>)[\r\n]+([^\<])/$1$2/g;
|
|
|
|
#remove leading whitespaces
|
|
#$content=~s/[\r\n]+([\d\S])/$1/g;
|
|
|
|
#remove empty lines
|
|
$content =~ s/[\n\r]+/\n/g;
|
|
|
|
#remove whitespaces between tags
|
|
$content =~ s/\>[\t ]+\<(^\/T)/\>\<$1/g;
|
|
|
|
#multiple whitespaces
|
|
$content =~ s/[\t ]+/ /g;
|
|
|
|
#restore content-type line break
|
|
$_[0] = $header . $content;
|
|
|
|
#$_[0]=~s/HTTP_CONTENT_TYPE/\n\n/;
|
|
# return $_[0];
|
|
}
|
|
|
|
#from XML::RSS.pm
|
|
my %entity = (
|
|
nbsp => " ",
|
|
iexcl => "¡",
|
|
cent => "¢",
|
|
pound => "£",
|
|
curren => "¤",
|
|
yen => "¥",
|
|
brvbar => "¦",
|
|
sect => "§",
|
|
uml => "¨",
|
|
copy => "©",
|
|
ordf => "ª",
|
|
laquo => "«",
|
|
not => "¬",
|
|
shy => "­",
|
|
reg => "®",
|
|
macr => "¯",
|
|
deg => "°",
|
|
plusmn => "±",
|
|
sup2 => "²",
|
|
sup3 => "³",
|
|
acute => "´",
|
|
micro => "µ",
|
|
para => "¶",
|
|
middot => "·",
|
|
cedil => "¸",
|
|
sup1 => "¹",
|
|
ordm => "º",
|
|
raquo => "»",
|
|
frac14 => "¼",
|
|
frac12 => "½",
|
|
frac34 => "¾",
|
|
iquest => "¿",
|
|
Agrave => "À",
|
|
Aacute => "Á",
|
|
Acirc => "Â",
|
|
Atilde => "Ã",
|
|
Auml => "Ä",
|
|
Aring => "Å",
|
|
AElig => "Æ",
|
|
Ccedil => "Ç",
|
|
Egrave => "È",
|
|
Eacute => "É",
|
|
Ecirc => "Ê",
|
|
Euml => "Ë",
|
|
Igrave => "Ì",
|
|
Iacute => "Í",
|
|
Icirc => "Î",
|
|
Iuml => "Ï",
|
|
ETH => "Ð",
|
|
Ntilde => "Ñ",
|
|
Ograve => "Ò",
|
|
Oacute => "Ó",
|
|
Ocirc => "Ô",
|
|
Otilde => "Õ",
|
|
Ouml => "Ö",
|
|
times => "×",
|
|
Oslash => "Ø",
|
|
Ugrave => "Ù",
|
|
Uacute => "Ú",
|
|
Ucirc => "Û",
|
|
Uuml => "Ü",
|
|
Yacute => "Ý",
|
|
THORN => "Þ",
|
|
szlig => "ß",
|
|
agrave => "à",
|
|
aacute => "á",
|
|
acirc => "â",
|
|
atilde => "ã",
|
|
auml => "ä",
|
|
aring => "å",
|
|
aelig => "æ",
|
|
ccedil => "ç",
|
|
egrave => "è",
|
|
eacute => "é",
|
|
ecirc => "ê",
|
|
euml => "ë",
|
|
igrave => "ì",
|
|
iacute => "í",
|
|
icirc => "î",
|
|
iuml => "ï",
|
|
eth => "ð",
|
|
ntilde => "ñ",
|
|
ograve => "ò",
|
|
oacute => "ó",
|
|
ocirc => "ô",
|
|
otilde => "õ",
|
|
ouml => "ö",
|
|
divide => "÷",
|
|
oslash => "ø",
|
|
ugrave => "ù",
|
|
uacute => "ú",
|
|
ucirc => "û",
|
|
uuml => "ü",
|
|
yacute => "ý",
|
|
thorn => "þ",
|
|
yuml => "ÿ",
|
|
);
|
|
|
|
my $entities = join( '|', keys %entity );
|
|
|
|
sub encode_xml_element($) {
|
|
my ($text) = @_;
|
|
|
|
my $encoded_text = '';
|
|
|
|
while ( $text =~ s/(.*?)(\<\!\[CDATA\[.*?\]\]\>)//s ) {
|
|
$encoded_text .= encode_xml_element_text($1) . $2;
|
|
}
|
|
$encoded_text .= encode_xml_element_text($text);
|
|
|
|
return $encoded_text;
|
|
}
|
|
|
|
sub encode_xml_element_text ($) {
|
|
my ($text) = @_;
|
|
|
|
$text =~ s/&(?!(#[0-9]+|#x[0-9a-fA-F]+|\w+);)/&/g;
|
|
$text =~ s/&($entities);/$entity{$1}/g;
|
|
$text =~ s/\</\<\;/g;
|
|
$text =~ s/\>/\>\;/g;
|
|
|
|
return $text;
|
|
}
|
|
|
|
sub escapeHtml($) {
|
|
my ($s) = @_;
|
|
|
|
return HTML::Entities::encode_entities( $s, q{&<>"'} );
|
|
}
|
|
|
|
#do not delete last line!
|
|
1;
|