package xm::cdocs;
use strict;
use xm::o;
use xm::sub;

sub DESC
{"
 here we scan for CDOCs - up to now they will probably be completly mark_off'd,
 so this subroutine mark_on all of it and tries to figure out which of the
 angles [<>] could actually be some xml'ish markup. It will leave those and
 just mark_off the rest. Sure, here we use some heuristics and it is up to
 you to enhance the heuristics gradually over time.
"}

sub ARGS { return xm::o::args_stdin(@_,DESC); }

sub do_litnam
{
    my $in = shift;
    $in =~ s{\&26;} {\&amp\;}gs;
    $in =~ s{\<}    {\&lt\;}gs;
    $in =~ s{\>}    {\&gt\;}gs;
    $in =~ s{\"}    {\&quot\;}gs;
    return $in;
}

sub do_litstr
{
    my $in = shift;
    $in =~ s{\\([^\d\w])} { xm::sub::off($1) }gs; # handle backslash-escapes
    $in =~ s{\&26\;} {\&amp\;}gs;
    $in =~ s{\<}     {\&lt\;}gs;
    $in =~ s{\>}     {\&gt\;}gs;
    $in =~ s{\"}     {\&quot\;}gs;
    return $in;
}

# be smart about accepting comment text, the sub will try to recognize any
# <...>-sequence that looks like xml. If there are angles that do not look like
# xml-tag they'll be replaced with their resp. entity-refs. Even more, this code
# does also introduce the ref-markup.
sub do_ccc
{
    my $in = shift; 

    $in = xm::sub::on($in);
    $in =~ s{\&} {\&\#26\;}gs;

    $in =~ s{=> (\s+)(\S+)(\s|$)} {"&[;XREF&];".do_litnam($2)."&[;/XREF&];".$3}gmex;
    $in =~ s{=> \"((?:[^\\\"]|\\.)*)\"} {"&[;XREF&];".do_litstr($1)."&[;/XREF&];"}gsex;
    $in =~ s{=> \'((?:[^\\\']|\\.)*)\'} {"&[;XREF&];".do_litstr($1)."&[;/XREF&];"}gsex;
    
    $in =~ s{\<([^<>!=?]*)\>}        {&[;$1&];}gm; # find <*>-tags 
    $in =~ s{<([/!?])([^<>]*)>}      {&[;$1$2&];}gm; # find </*>-tags 
    $in =~ s{<(\w+\s+\w+=)([^<>]*)>} {&[;$1$2&];}gs; # find <* *=*>-tags, can span lines

    # assume all other angles to be normal text    
    $in =~ s{<} {&lt;}gs;
    $in =~ s{>} {&gt;}gs;

    $in =~ s{\&\[\;}  {\<}gs;
    $in =~ s{\&\]\;}  {\>}gs;
    
    return $in;
}

sub DO
{
    my $in = shift;

    $in =~ s{ (<CDOC>) ( (?:.(?!</?CDOC>))* .) (</CDOC>) }
    { $1.do_ccc($2).$3 }gsex;

    return $in;
}

1;

