HEX

File: //usr/local/share/namazu/pl/htmlsplit.pl
#
# -*- Perl -*-
# $Id: htmlsplit.pl,v 1.9.4.6 2008-04-27 18:16:43 opengl2772 Exp $
#
# Copyright (C) 2000-2008 Namazu Project All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
# 
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding

package htmlsplit;
require "util.pl";
#require "html.pl"; # don't need it because it sould be already loaded by load_filtermodules()

use strict;
use File::Copy;


my $Header = << 'EOS';
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
        "http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<link rev=made href="mailto:${author}">
<title>${subject}</title>
</head>
<body>
<hr>
EOS

    my $Footer = << 'EOS';
<hr>
</body>
</html>
EOS

sub split ($$) {
    my ($fname, $base) = @_;

    my $mtime = (stat($fname))[9];
    my $cont = '';

    # for handling a filename which contains Shift_JIS code for Windows.
    # for handling a filename which contains including space.
    if (($fname =~ /\s/) ||
        ($English::OSNAME eq "MSWin32"
        && $fname =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/))
    {
        my $shelter_fname = $fname;
        $fname = util::tmpnam("NMZ.win32");
        unlink $fname if (-e $fname);
        copy($shelter_fname, $fname);

        $cont = util::readfile($fname);

        unlink $fname;
        $fname = $shelter_fname;
    } else {
        $cont = util::readfile($fname);
    }

    codeconv::codeconv_document(\$cont);

    my %info = (
		'title'    => get_title(\$cont),
		'author'   => get_author(\$cont),
		'anchored' => "",
		'name'     => "",
		'base'     => $base,
		'names'    => [],
		);

    # <http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2>
    # 
    # In certain cases, authors may specify the value of an attribute
    # without any quotation marks. The attribute value may only contain
    # letters (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45), and
    # periods (ASCII decimal 46). We recommend using quotation marks even
    # when it is possible to eliminate them.

    my $id = 0;
#    $cont =~ s/(<a\s[^>]*href=(["']))#(.+?)(\2[^>]*>)/$1$3.html$4/gi; #'
    $cont =~ s#(<a[^>]*\s+)name=(["'])\2([^>]*>(.*?)</a>)#$1$4#sgi; #'
    $cont =~ s {
                \G(.+?)                                      # 1
	        (<h([1-6])>)?\s*                             # 2, 3
                <a[^>]*\s+name=([a-zA-Z0-9-\.]+|             # 4,
                (["']).+?\5)[^>]*>(.*?)</a>                  # 5,6
                \s*(</h\3>)?                                 # 7
             } {
                write_partial_file($1, $4, $6, $id++, $mtime, \%info)
             }sgexi;
    write_partial_file($cont, "", "", $id, $mtime, \%info);

    return @{$info{'names'}};
}

sub get_title ($) {
    my ($contref) = @_;
    my $title = undef;
    
    if ($$contref =~ s!<TITLE[^>]*>([^<]+)</TITLE>!!i) {
	$title = $1;
	$title =~ s/\s+/ /g;
	$title =~ s/^\s+//;
	$title =~ s/\s+$//;
    } else {
	$title = "no title";
    }

    return $title;
}

sub get_author ($) {
    my ($contref) = @_;

    my $author = "unknown";

    # <LINK REV=MADE HREF="mailto:ccsatoru@vega.aichi-u.ac.jp">

    if ($$contref =~ m!<LINK\s[^>]*?HREF=([\"\'])mailto:(.*?)\1\s*>!i) { #"
	$author = $2;
    } elsif ($$contref =~ m!.*<ADDRESS[^>]*>([^<]*?)</ADDRESS>!i) {
	my $tmp = $1;
	if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) {
	    $author = $1;
	}
    }
    return $author;
}

sub write_partial_file($$$$$$) {
    my ($cont, $name, $anchored, $id, $mtime, $info_ref) = @_;

    $name =~ s/^([\"\'])(.*)\1$/$2/;  # Remove quotation marks.

    my $author        = $info_ref->{'author'};
    my $base          = $info_ref->{'base'};
    my $orig_title    = $info_ref->{'title'};
    my $prev_name     = $info_ref->{'name'};
    my $prev_anchored = $info_ref->{'anchored'};

    $prev_name        =~ s#\n\r##sg;
    $prev_name        =~ s#\n##sg;

    html::remove_html_elements(\$prev_anchored);
    $prev_anchored =~ s/^\s+//;
    $prev_anchored =~ s/\s+$//;
    my $title = $orig_title;

    # FIXME: I don't know why this processing causes "Use of
    # uninitialized value" warning if use $prev_anchored or
    # $prev_name directly. perl's bug?
    if ($prev_anchored ne "") {
	$title .= ": $prev_anchored";
    } elsif ($prev_name ne "") {
	$title .= ": $prev_name";
    }

    my $fname = util::tmpnam("$base.$id");
    my $fh = util::efopen(">$fname");
    my $header = $Header;
    $header =~ s/\$\{subject\}/$title/g;
    $header =~ s/\$\{author\}/$author/g;
    print $fh $header;
    print $fh $cont;

    my $footer = $Footer;
    print $fh $footer;

    push @{$info_ref->{'names'}}, $prev_name;
    $info_ref->{'anchored'} = $anchored;
    $info_ref->{'name'} = $name;

    # FIXME: Actually we don't need this. 
    #        But some perl versions need this.
    util::fclose($fh);
    utime($mtime, $mtime, $fname);

    return "";
}

1;