Front page | perl.cvs.parrot |
Postings from January 2009
[svn:parrot] r35956 - in branches/strings/pseudocode: . t
From:
simon
Date:
January 24, 2009 04:36
Subject:
[svn:parrot] r35956 - in branches/strings/pseudocode: . t
Message ID:
20090124123631.87DD5CB9AE@x12.develooper.com
Author: simon
Date: Sat Jan 24 04:36:25 2009
New Revision: 35956
Added:
branches/strings/pseudocode/t/recode.t (contents, props changed)
Modified:
branches/strings/pseudocode/Encodings.pm
branches/strings/pseudocode/ParrotString.pm
branches/strings/pseudocode/t/create.t
Log:
Some bug fixes, and now we have UTF8->NFG->UTF8 round-tripping.
Modified: branches/strings/pseudocode/Encodings.pm
==============================================================================
--- branches/strings/pseudocode/Encodings.pm (original)
+++ branches/strings/pseudocode/Encodings.pm Sat Jan 24 04:36:25 2009
@@ -42,25 +42,52 @@
if 191 < $c < 224 { return 2 }
return 3
}
+ sub _bytes_needed($c) {
+ if $c < 0x80 { return 1 }
+ if $c < 0x0800 { return 2 }
+ return 3;
+ }
sub char_at_byteoffset ($str, $offset is rw) { # Private helper
+ if ($offset > $str.strlen) { Parrot_debug_string($str); die "BUG: Asked for a byte "~$offset~" that's not there" };
my $c = $str.buffer.[$offset++];
if 191 < $c < 224 {
- # XXX Guard
+ if ($offset + 1 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
$c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 );
} elsif $c >= 224 {
- # XXX Guard
+ if ($offset + 2 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
$c = (($c +& 15) +< 12)
+| (( $str.buffer.[$offset++] +& 63 ) +< 6);
$c +|= $str.buffer.[$offset++] +& 63;
}
return $c;
}
+
+ method append_char($str, $c) {
+ $str.bufused += _bytes_needed($c);
+ $str.strlen += _bytes_needed($c);
+ if ($c < 0x80) {
+ push $str.buffer, $c;
+ } elsif ($c < 0x0800) {
+ push $str.buffer, $c +> 6 +| 0xc0;
+ push $str.buffer, $c +& 0x3f +| 0x80;
+ } else {
+ push $str.buffer, $c +> 12 +| 0xe0;
+ push $str.buffer, $c +> 6 +& 0x3f +| 0x80;
+ push $str.buffer, $c +& 0x3f +| 0x80;
+ }
+ }
+
+ method append_grapheme($str, $g) {
+ for (@($g)) { self.append_char($str, $_) }
+ }
+
method string_char_iterate ($str, $callback, $parameter) {
my $index = 0;
- while ($index < $str.bufused-1) {
+ while ($index < $str.bufused) {
$callback(char_at_byteoffset($str, $index), $parameter);
}
}
+
method string_grapheme_iterate ($str, $callback, $parameter) {
if ($str.charset !~~ ParrotCharset::Unicode) {
# Although why you'd store non-Unicode in UTF8 is beyond me
@@ -71,7 +98,7 @@
}
# Collect characters into graphemes in a roughly O(n) way...
my $index = 0;
- while ($index < $str.bufused-1) {
+ while ($index < $str.bufused) {
my $c = char_at_byteoffset($str, $index);
# If we're the last character, do the callback and give up
@@ -83,7 +110,7 @@
my $next_char;
my $nc_index = $index;
my $end_of_grapheme_sequence = $index;
- while ($nc_index <= $str.bufused and
+ while ($nc_index < $str.bufused and
$next_char = char_at_byteoffset($str, $nc_index)
and ParrotCharset::Unicode::is_combining($next_char)) {
$end_of_grapheme_sequence = $nc_index;
@@ -113,13 +140,13 @@
method setup($str) { $str.normalization = ParrotNormalization::NFG.new(); }
method append_grapheme ($str, $g) {
- my $item;
if (@($g) > 1) {
+ my $item;
$item = $str.normalization.get_grapheme_table_entry(@($g));
+ $str.buffer.push($item);
} else {
- ($item) = @($g);
+ $str.buffer.push(@( $g ));
}
- $str.buffer.push($item);
$str.bufused++;
$str.strlen++;
}
@@ -146,7 +173,7 @@
}
my $c = $str.buffer[$index];
if $c >= 0 { return [ $c ]; }
- return $str.normalization.grapheme_table.[-$c];
+ return $str.normalization.grapheme_table.[-$c - 1];
# We are allowed to be pally with the normalization internals
# because NFG is specific to ParrotEncoding.
}
Modified: branches/strings/pseudocode/ParrotString.pm
==============================================================================
--- branches/strings/pseudocode/ParrotString.pm (original)
+++ branches/strings/pseudocode/ParrotString.pm Sat Jan 24 04:36:25 2009
@@ -100,3 +100,29 @@
sub Parrot_string_grapheme_chopn($src, $count) {
return Parrot_string_replace($src, Parrot_string_grapheme_length($src) - $count, $count, undef);
}
+
+sub Parrot_debug_string($src) {
+ say "String charset: "~$src.charset;
+ say "String encoding: "~$src.encoding;
+ say "String normalization: "~$src.normalization;
+ say "String buffer used: "~$src.bufused;
+ say "String length: "~$src.strlen;
+ say "String buffer contents: ";
+ for ( $src.buffer) { print " ["~$_~"]"; }
+ say "";
+}
+
+sub Parrot_string_byte_equal($one, $two) {
+ if ($one.strlen != $two.strlen) { return 0; }
+ for (0 .. $one.strlen-1) {
+ if ($one.buffer.[$_] != $two.buffer.[$_]) {
+ say "Oops, byte "~$_~" differed";
+ return 0
+ }
+ }
+ return 1;
+}
+sub Parrot_string_character_equal($one, $two) {
+ say "Not implemented yet";
+ return 0;
+}
Modified: branches/strings/pseudocode/t/create.t
==============================================================================
--- branches/strings/pseudocode/t/create.t (original)
+++ branches/strings/pseudocode/t/create.t Sat Jan 24 04:36:25 2009
@@ -1,6 +1,6 @@
use Test;
use ParrotString;
-plan 11;
+plan 10;
my $str = Parrot_string_new_init("flurble", 4, ParrotCharset::ASCII, ParrotEncoding::Byte);
ok($str.charset ~~ ParrotCharset::ASCII, "Charset set properly");
@@ -16,9 +16,3 @@
is(Parrot_string_byte_length($str), 28, "String byte length correct");
is(Parrot_string_length($str), 15, "UTF8 char length correct");
is(Parrot_string_index($str, 3), 0x3ac, "UTF8 string indexing");
-
-# The standard NFG example...
-$str = Parrot_string_new_init("ABC \xd0\xb8\xcc\x8f", 8, ParrotCharset::Unicode, ParrotEncoding::UTF8);
-my $str2 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, ParrotEncoding::ParrotNative);
-Parrot_string_grapheme_copy($str, $str2);
-is(Parrot_string_grapheme_length($str2), 5, "Four UTF8 bytes = one grapheme");
Added: branches/strings/pseudocode/t/recode.t
==============================================================================
--- (empty file)
+++ branches/strings/pseudocode/t/recode.t Sat Jan 24 04:36:25 2009
@@ -0,0 +1,15 @@
+use Test;
+use ParrotString;
+plan 4;
+
+# The standard NFG example...
+my $str = Parrot_string_new_init("ABC \xd0\xb8\xcc\x8f", 8, ParrotCharset::Unicode, ParrotEncoding::UTF8);
+my $str2 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, ParrotEncoding::ParrotNative);
+Parrot_string_grapheme_copy($str, $str2);
+is(Parrot_string_grapheme_length($str2), 5, "Four UTF8 bytes = one grapheme");
+my $str3 = Parrot_string_new_init("", 0, ParrotCharset::Unicode, ParrotEncoding::UTF8);
+
+Parrot_string_grapheme_copy($str2, $str3);
+ok(Parrot_string_byte_equal($str, $str3), "Round-tripping UTF8" );
+ok(Parrot_string_character_equal($str, $str3), "Character equivalence for UTF8" );
+ok(Parrot_string_character_equal($str2, $str3), "Character equivalence between UTF8 and NFG" );
-
[svn:parrot] r35956 - in branches/strings/pseudocode: . t
by simon