HEX

File: //usr/local/share/namazu/pl/wakati.pl
#
# -*- Perl -*-
# $Id: wakati.pl,v 1.9.8.10 2009-01-28 17:54:57 opengl2772 Exp $
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
# Copyright (C) 2000-2009 Namazu Project All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
#
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding
#

package wakati;
use strict;

# Do wakatigaki processing for a Japanese text.
sub wakatize_japanese ($) {
    my ($content) = @_;

    my @tmp = wakatize_japanese_sub($content);

    # Remove words consists of only Hiragana characters
    # when -H option is specified.
    # Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
    # [1997-11-13]
    # And do Okurigana processing. [1998-04-24]
    if ($var::Opt{'hiragana'} || $var::Opt{'okurigana'}) {
        for (my $ndx = 0; $ndx <= $#tmp; ++$ndx) {
	    $tmp[$ndx] =~ s/(\s)/ $1/g;
	    $tmp[$ndx] = ' ' . $tmp[$ndx];
	    if ($var::Opt{'okurigana'}) {
		$tmp[$ndx] =~ s/((?:[^\xa4][\xa1-\xfe])+)(?:\xa4[\xa1-\xf3])+ /$1 /g;
	    }
	    if ($var::Opt{'hiragana'}) {
		$tmp[$ndx] =~ s/ (?:\xa4[\xa1-\xf3])+ //g;
	    }
        }
    }

    # Collect only noun words when -m option is specified.
    if ($var::Opt{'noun'}) {
	$$content = "";
	$$content .= shift(@tmp) =~ /(.+ )\xcc\xbe\xbb\xec/ ? $1 : "" while @tmp;
	# noun (meisi) in Japanese "cc be bb ec"
    } else {
	$$content = join("\n", @tmp);
    }
    $$content =~ s/^\s+//gm;
    $$content =~ s/\s+$//gm;
    $$content =~ s/ +/ /gm;
    $$content .= "\n";
    util::dprint(_("-- wakatized content --\n")."$$content\n");
}

sub wakatize_japanese_sub ($) {
    my ($content) = @_;
    my $str = "";
    my @tmp = ();

    if ($conf::WAKATI =~ /^module_(\w+)/) {
	my $module = $1;
	if ($module eq "kakasi") {
	    $str = $$content;
	    $str =~ s/([\x80-\xff]+)/{my $text = Text::Kakasi::do_kakasi($1); " $text ";}/ge;
	} elsif ($module eq "chasen") {
            if ($var::Opt{'noun'}) {
	        $str = Text::ChaSen::sparse_tostr_long($$content);
            } else {
                $str = $$content;
	        $str =~ s/([\x80-\xff]+)/{my $text = Text::ChaSen::sparse_tostr_long($1); " $text ";}/ge;
            }
	} elsif ($module eq "mecab") {
	    use vars qw($t);
	    if (!defined $t) {
		require MeCab;
		import MeCab;
                eval '$t = new MeCab::Tagger("-Owakati");' or
                    $t = new MeCab::Tagger([qw(mecab -O wakati)]);
	    }
	    END {
		$t->DESTROY() if defined $t;
	    };
            $str = $$content;
            $str =~ s/([\x80-\xff]+)/{my $s = $1; my $text = $t->parse($s); " $text ";}/ge;
	} else {
	    util::cdie(_("invalid wakati module: ")."$module\n");
	}
        util::dprint(_("-- wakatized bare content --\n")."$str\n\n");
	@tmp = split('\n', $str);
    } else {
	my $tmpfile = util::tmpnam("NMZ.wakati");
        util::dprint(_("wakati: using ")."$conf::WAKATI\n");
	# Don't use IPC::Open2 because it's not efficent.
        if ($var::Opt{'noun'}) {
            my $fh_wakati = util::efopen("|$conf::WAKATI > $tmpfile");
            print $fh_wakati $$content;
            util::fclose($fh_wakati);
        } else {
            $str = $$content;

            my $redirect = ">";
            while(1) {
                if ($str =~ s/^([\x80-\xff]+)//s) {
                    my $fh_wakati = util::efopen("|$conf::WAKATI $redirect $tmpfile");
                    print $fh_wakati " $1\n";
                    util::fclose($fh_wakati);
                } elsif ($str =~ s/^([\x00-\x7f]+)//s) {
                    my $fh_wakati = util::efopen("$redirect $tmpfile");
                    print $fh_wakati " $1 ";
                    util::fclose($fh_wakati);
                } else {
                    last;
                }

                $redirect = ">>";
            }
	}
	{
	    my $fh_wakati = util::efopen($tmpfile);
	    @tmp = <$fh_wakati>;
	    chomp @tmp;
            util::fclose($fh_wakati);
	}
	unlink $tmpfile;
    }

    return @tmp;
}

1;