HEX

File: //usr/local/share/namazu/pl/codeconv.pl
#
# -*- Perl -*-
# $Id: codeconv.pl,v 1.11.8.17 2009-01-29 07:55:49 opengl2772 Exp $
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
# Copyright (C) 2000-2009 Namazu Project All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
#
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding
#
# package for code conversion
#
#   imported from  Rei FURUKAWA <furukawa@dkv.yamaha.co.jp> san's pnamazu.
#   [1998-09-24]

package codeconv;
use strict;

my @ktoe = (0xA3, 0xD6, 0xD7, 0xA2, 0xA6, 0xF2, 0xA1, 0xA3,
	     0xA5, 0xA7, 0xA9, 0xE3, 0xE5, 0xE7, 0xC3, 0xBC,
	     0xA2, 0xA4, 0xA6, 0xA8, 0xAA, 0xAB, 0xAD, 0xAF,
	     0xB1, 0xB3, 0xB5, 0xB7, 0xB9, 0xBB, 0xBD, 0xBF,
	     0xC1, 0xC4, 0xC6, 0xC8, 0xCA, 0xCB, 0xCC, 0xCD,
	     0xCE, 0xCF, 0xD2, 0xD5, 0xD8, 0xDB, 0xDE, 0xDF,
	     0xE0, 0xE1, 0xE2, 0xE4, 0xE6, 0xE8, 0xE9, 0xEA,
	     0xEB, 0xEC, 0xED, 0xEF, 0xF3, 0xAB, 0xAC, );

# convert JIS X0201 KANA characters to JIS X0208 KANA
sub ktoe ($$) {
    my ($c1, $c2) = @_;
    $c1 = ord($c1) & 0x7f;
    my($hi) = ($c1 <= 0x25 || $c1 == 0x30 || 0x5e <= $c1)? "\xa1": "\xa5";
    $c1 -= 0x21;
    my($lo) = $ktoe[$c1];
    if ($c2) {
        if ($c1 == 5) {
            $lo = 0xdd;
        } else {
            $lo++;
            $lo++ if (ord($c2) == 0xdf);
        }
    }
    return $hi . chr($lo);
}

sub eucjp_han2zen_kana ($) {
    my ($strref) = @_;
    if (util::islang("ja")) {
        $$strref =~ s/\x8e([\xa1-\xdf])(\x8e([\xde\xdf]))?/&ktoe($1,$3)/geo;
    }
}

# convert Shift_JIS to EUC-JP
sub stoe ($) {
    my ($c1, $c2) = unpack('CC', shift);

    if (0xa1 <= $c1 && $c1 <= 0xdf) {
        $c2 = $c1;
        $c1 = 0x8e;
    } elsif (0x9f <= $c2) {
        $c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe0 : 0x60);
        $c2 += 2;
    } else {
        $c1 = $c1 * 2 - ($c1 >= 0xe0 ? 0xe1 : 0x61);
        $c2 += 0x60 + ($c2 < 0x7f);
    }

    # Outside of the range of an EUC-JP code.
    return chr(0xa2) . chr(0xae)
        if ($c1 < 0x80 || $c1 >= 0x100 || $c2 < 0x80 || $c2 >= 0x100);

    return chr($c1) . chr($c2);
}

sub shiftjis_to_eucjp ($) {
    my ($str) = @_;
    if (util::islang("ja")) {
        $str =~ s/([\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]|[\xa1-\xdf])/&stoe($1)/geo;
    }
    return $str;
}

# convert EUC-JP to Shift_JIS
sub etos ($) {
    my ($c1, $c2) = unpack('CC', shift);

    if ($c1 == 0x8e) {      # JIS X 0201 KATAKANA
        return chr($c2);
    } elsif ($c1 == 0x8f) { # JIS X 0212 HOJO KANJI
        return "\x81\xac";
    } elsif ($c1 % 2) {
        $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x31 : 0x71);
        $c2 -= 0x60 + ($c2 < 0xe0);
    } else {
        $c1 = ($c1>>1) + ($c1 < 0xdf ? 0x30 : 0x70);
        $c2 -= 2;
    }
    return chr($c1) . chr($c2);
}

sub eucjp_to_shiftjis ($) {
    my ($str) = @_;
    if (util::islang("ja")) {
        $str =~ s/([\xa1-\xfe][\xa1-\xfe]|\x8e[\xa1-\xdf]|\x8f[\xa1-\xfe][\xa1-\xfe])/&etos($1)/ge;
    }
    return $str;
}

# Remove a garbage EUC-JP 1st charactor at the end.
sub chomp_eucjp ($) {
    my ($str) = @_;
    if (util::islang("ja")) {
        if ($str =~ /\x8f$/ or $str =~ tr/\x8e\xa1-\xfe// % 2) {
            chop($str);
            chop($str) if ($str =~ /\x8f$/);
        }
    }
    return $str;
}

# convert to EUC-JP by using NKF
sub toeuc ($) {
    my ($contref, $opt) = @_;

    if (util::islang("ja")) {
        my $nkf_opt = "-emXZ1";

        if ($var::USE_NKF_MODULE) {
            $$contref = NKF::nkf($nkf_opt, $$contref); # namazu-devel-ja #3152 -> backed out, #3181
        } else {
            my $nkftmp = util::tmpnam("NMZ.nkf");
            {
                my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
                print $nh $$contref;
                util::fclose($nh);
            }
            {
                my $nh = util::efopen("< $nkftmp");
                $$contref = util::readfile($nh);
                util::fclose($nh);
            }
            unlink($nkftmp);
        }
    }
}

sub eucjp_zen2han_ascii ($) {
    my ($strref) = @_;
    if (util::islang("ja")) {
        $$strref =~ s/([\xa1-\xfe][\xa1-\xfe]|\x8e[\xa1-\xdf]|\x8f[\xa1-\xfe][\xa1-\xfe])/
        my $tmp = $1;
        if ($tmp =~ m!\xa3([\xb0-\xb9\xc1-\xda\xe1-\xfa])!) {
            $tmp = $1 & "\x7F";
        } elsif ($tmp =~ m!\xa1([\xa0-\xfe])!) {
            my $kigou = (
                # X0208 kigou conversion table
                # 0xa1a0 - 0xa1fe
                "\x00","\x20","\x00","\x00","\x2C","\x2E","\x00","\x3A",
                "\x3B","\x3F","\x21","\x00","\x00","\x27","\x60","\x00",
                "\x5E","\x00","\x5F","\x00","\x00","\x00","\x00","\x00",
                "\x00","\x00","\x00","\x00","\x00","\x2D","\x00","\x2F",
                "\x5C","\x00","\x00","\x7C","\x00","\x00","\x60","\x27",
                "\x22","\x22","\x28","\x29","\x00","\x00","\x5B","\x5D",
                "\x7B","\x7D","\x3C","\x3E","\x00","\x00","\x00","\x00",
                "\x00","\x00","\x00","\x00","\x2B","\x2D","\x00","\x00",
                "\x00","\x3D","\x00","\x3C","\x3E","\x00","\x00","\x00",
                "\x00","\x00","\x00","\x00","\x00","\x00","\x00","\x00",
                "\x24","\x00","\x00","\x25","\x23","\x26","\x2A","\x40",
                "\x00","\x00","\x00","\x00","\x00","\x00","\x00","\x00"
            )[unpack("C", $1) - unpack("C", "\xa0")];
            $tmp = $kigou unless ($kigou eq "\x00");
        }
        $tmp;
        /gse;
    }
}

sub normalize_eucjp ($) {
    my ($contref) = @_;
    if (util::islang("ja")) {
        codeconv::eucjp_han2zen_kana($contref);
        codeconv::eucjp_zen2han_ascii($contref);
    }
    $contref;
}

sub normalize_nl ($) {
    my ($conts) = @_;

    $$conts =~ s/\x0d\x0a/\x0a/g;  # Windows
    $$conts =~ s/\x0d/\x0a/g;      # Mac
    $$conts =~ s/\x0a/\n/g;
    $$conts;
}

sub remove_control_char ($) {
    my ($textref) = @_;
    $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Remove control char.
}

sub normalize_document ($) {
    my ($textref) = @_;
    codeconv::normalize_nl($textref);
    codeconv::remove_control_char($textref);
}

sub codeconv_document ($) {
    my ($textref) = @_;
    #codeconv::to_inner_encoding($textref, 'unknown');
    codeconv::toeuc($textref);
    codeconv::normalize_document($textref);
}

sub normalize_eucjp_document ($) {
    my ($textref) = @_;
    codeconv::normalize_eucjp($textref);
    codeconv::normalize_document($textref);
}

sub tousascii ($) {
    my ($contref) = @_;

    $$contref =~ s/[\x80-\xFF]/#/g;

    return $$contref;
}

1;