在这里。试试这个熟悉Unicode的wc程序版本。
-
它跳过非文件参数(管道、目录、套接字等)。
-
它采用UTF-8文本。
-
它将任何Unicode空白作为单词分隔符进行计数。
-
如果存在一个
.ENCODING
在文件名末尾,比如
foo.cp1252
,
foo.latin1
,
foo.utf16
等。
-
它还可以处理以各种格式压缩的文件。
-
它给出了
段落、线条、文字、字形、字符
和
字节。
-
它了解所有Unicode换行符序列。
-
它警告有换行错误的损坏的文本文件。
这里是一个运行它的例子:
   Paras    Lines    Words   Graphs    Chars    Bytes File
       2     2270    82249   504169   504333   528663 /tmp/ap
       1     2404    11163    63164    63164    66336 /tmp/b3
uwc: missing linebreak at end of corrupted textfiile /tmp/bad
      1*       2*        4       19       19       19 /tmp/bad
       1       14       52      273      273      293 /tmp/es
      57      383     1369    11997    11997    12001 /tmp/funny
       1   657068  3175429 31205970 31209138 32633834 /tmp/lw
       1        1        4       27       27       27 /tmp/nf.cp1252
       1        1        4       27       27       34 /tmp/nf.euc-jp
       1        1        4       27       27       27 /tmp/nf.latin1
       1        1        4       27       27       27 /tmp/nf.macroman
       1        1        4       27       27       54 /tmp/nf.ucs2
       1        1        4       27       27       56 /tmp/nf.utf16
       1        1        4       27       27       54 /tmp/nf.utf16be
       1        1        4       27       27       54 /tmp/nf.utf16le
       1        1        4       27       27      112 /tmp/nf.utf32
       1        1        4       27       27      108 /tmp/nf.utf32be
       1        1        4       27       27      108 /tmp/nf.utf32le
       1        1        4       27       27       39 /tmp/nf.utf7
       1        1        4       27       27       31 /tmp/nf.utf8
       1    26906   101528   635841   636026   661202 /tmp/o2
131Â Â Â Â Â Â 346Â Â Â Â Â 1370Â Â Â Â Â 9590Â Â Â Â Â 9590Â Â Â Â Â 4486Â /tmp/perl5122delta.pod.gz
291Â Â Â Â Â Â 814Â Â Â Â Â 3941Â Â Â Â 25318Â Â Â Â 25318Â Â Â Â Â 9878Â /tmp/perl51310delta.pod.bz2
       1     2551     5345   132655   132655   133178 /tmp/tailsort-pl.utf8
       1       89      334     1784     1784     2094 /tmp/til
       1        4       18       88       88      106 /tmp/w
     276     1736     5773    53782    53782    53804 /tmp/www
来吧:
#!/usr/bin/env perl
#########################################################################
# uniwc - improved version of wc that works correctly with Unicode
#
# Tom Christiansen <tchrist@perl.com>
# Mon Feb 28 15:59:01 MST 2011
#########################################################################
use 5.10.0;
use strict;
use warnings FATAL => "all";
use sigtrap qw[ die untrapped normal-signals ];
use Carp;
$SIG{__WARN__} = sub {
confess("FATALIZED WARNING: @_") unless $^S;
};
$SIG{__DIE__} = sub {
confess("UNCAUGHT EXCEPTION: @_") unless $^S;
};
$| = 1;
my $Errors = 0;
my $Headers = 0;
sub yuck($) {
my $errmsg = $_[0];
$errmsg =~ s/(?<=[^\n])\z/\n/;
print STDERR "$0: $errmsg";
}
process_input(\&countem);
sub countem {
my ($_, $file) = @_;
my (
@paras, @lines, @words,
$paracount, $linecount, $wordcount,
$grafcount, $charcount, $bytecount,
);
if ($charcount = length($_)) {
$wordcount = eval { @words = split m{ \p{Space}+ }x };
yuck "error splitting words: $@" if $@;
$linecount = eval { @lines = split m{ \R }x };
yuck "error splitting lines: $@" if $@;
$grafcount = 0;
$grafcount++ while /\X/g;
#$grafcount = eval { @lines = split m{ \R }x };
yuck "error splitting lines: $@" if $@;
$paracount = eval { @paras = split m{ \R{2,} }x };
yuck "error splitting paras: $@" if $@;
if ($linecount && !/\R\z/) {
yuck("missing linebreak at end of corrupted textfiile $file");
$linecount .= "*";
$paracount .= "*";
}
}
$bytecount = tell;
if (-e $file) {
$bytecount = -s $file;
if ($bytecount != -s $file) {
yuck "filesize of $file differs from bytecount\n";
$Errors++;
}
}
my $mask = "%8s " x 6 . "%s\n";
printf $mask => qw{ Paras Lines Words Graphs Chars Bytes File } unless $Headers++;
printf $mask => map( { show_undef($_) }
$paracount, $linecount,
$wordcount, $grafcount,
$charcount, $bytecount,
), $file;
}
sub show_undef {
my $value = shift;
return defined($value)
? $value
: "undef";
}
END {
close(STDOUT) || die "$0: can't close STDOUT: $!";
exit($Errors != 0);
}
sub process_input {
my $function = shift();
my $enc;
if (@ARGV == 0 && -t) {
warn "$0: reading from stdin, type ^D to end or ^C to kill.\n";
}
unshift(@ARGV, "-") if @ARGV == 0;
FILE:
for my $file (@ARGV) {
# don't let magic open make an output handle
next if -e $file && ! -f _;
my $quasi_filename = fix_extension($file);
$file = "standard input" if $file eq q(-);
$quasi_filename =~ s/^(?=\s*[>|])/< /;
no strict "refs";
my $fh = $file; # is *so* a lexical filehandle! âº
unless (open($fh, $quasi_filename)) {
yuck("couldn't open $quasi_filename: $!");
next FILE;
}
set_encoding($fh, $file) || next FILE;
my $whole_file = eval {
use warnings "FATAL" => "all";
local $/;
scalar <$fh>;
};
if ($@) {
$@ =~ s/ at \K.*? line \d+.*/$file line $./;
yuck($@);
next FILE;
}
$function->($whole_file, $file);
unless (close $fh) {
yuck("couldn't close $quasi_filename at line $.: $!");
next FILE;
}
} # foreach file
}
sub set_encoding(*$) {
my ($handle, $path) = @_;
my $enc_name = "utf8";
if ($path && $path =~ m{ \. ([^\s.]+) \z }x) {
my $ext = $1;
die unless defined $ext;
require Encode;
if (my $enc_obj = Encode::find_encoding($ext)) {
my $name = $enc_obj->name || $ext;
$enc_name = "encoding($name)";
}
}
return 1 if eval {
use warnings FATAL => "all";
no strict "refs";
binmode($handle, ":$enc_name");
1;
};
for ($@) {
s/ at .* line \d+\.//;
s/$/ for $path/;
}
yuck("set_encoding: $@");
return undef;
}
sub fix_extension {
my $path = shift();
my %Compress = (
Z => "zcat",
z => "gzcat", # for uncompressing
gz => "gzcat",
bz => "bzcat",
bz2 => "bzcat",
bzip => "bzcat",
bzip2 => "bzcat",
lzma => "lzcat",
);
if ($path =~ m{ \. ( [^.\s] +) \z }x) {
if (my $prog = $Compress{$1}) {
return "$prog $path |";
}
}
return $path;
}