Front page | perl.cvs.parrot |
Postings from January 2009
[svn:parrot] r35582 - in branches/strings/pseudocode: . t
From:
simon
Date:
January 15, 2009 03:25
Subject:
[svn:parrot] r35582 - in branches/strings/pseudocode: . t
Message ID:
20090115112538.D24FACB9AE@x12.develooper.com
Author: simon
Date: Thu Jan 15 03:25:38 2009
New Revision: 35582
Modified:
branches/strings/pseudocode/ParrotString.pm
branches/strings/pseudocode/t/create.t
Log:
Another function or two done, plus the start of UTF8 support.
Modified: branches/strings/pseudocode/ParrotString.pm
==============================================================================
--- branches/strings/pseudocode/ParrotString.pm (original)
+++ branches/strings/pseudocode/ParrotString.pm Thu Jan 15 03:25:38 2009
@@ -17,16 +17,55 @@
class ParrotCharset::SJIS { };
class ParrotCharset::EUCJP { };
-class ParrotEncoding::UTF8 { };
+class ParrotEncoding::UTF8 {
+ sub _skip($c) {
+ if $c <= 191 { return 1 }
+ if 191 < $c < 224 { return 2 }
+ return 3
+ }
+ sub char_at_byteoffset ($str, $offset is rw) { # Private helper
+ my $c = $str.buffer.[$offset++];
+ if 191 < $c < 224 {
+ # XXX Guard
+ $c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 );
+ } elsif $c >= 224 {
+ # XXX Guard
+ $c = (($c +& 15) +< 12)
+ +| (( $str.buffer.[$offset++] +& 63 ) +< 6);
+ $c +|= $str.buffer.[$offset++] +& 63;
+ }
+ return $c;
+ }
+ method string_char_iterate ($str, $callback, $parameter) {
+ my $index = 0;
+ while ($index < $str.bufused-1) {
+ $callback(char_at_byteoffset($str, $index), $parameter);
+ }
+ }
+
+ # We're not going to cache this because if it's worth caching it's
+ # worth converting to a Parrot native string rather than keeping
+ # UTF8. We'll keep it dumb and working and people can optimise later
+ method char_at_index($str, $index) {
+ my $i = $index + 0; # work around Rakudo assignment weirdness
+ my $offset = 0;
+ while $i-- > 0 { $offset += _skip($str.buffer[$offset]) }
+ return char_at_byteoffset($str, $offset);
+ }
+};
class ParrotEncoding::UTF16 { };
class ParrotEncoding::UTF32 { };
class ParrotEncoding::EBCDIC { };
class ParrotEncoding::Byte {
- method string_grapheme_iterate ($str, $callback, $parameter) {
- for (0..$str.bufused-1) {
- $callback($str.buffer.[$_], $parameter);
- }
+ method string_char_iterate ($str, $callback, $parameter) {
+ for (0..$str.bufused-1) { $callback($str.buffer.[$_], $parameter); }
}
+
+ method string_grapheme_iterate($str, $callback, $parameter) {
+ for (0..$str.bufused-1) { $callback($str.buffer.[$_], $parameter); }
+ }
+
+ method char_at_index($str, $index) { return $str.buffer.[$index] }
};
@@ -58,7 +97,7 @@
my $news = ParrotString.new();
$news.charset = $charset;
$news.encoding = $encoding;
- $news.buffer = split("", $s);
+ $news.buffer = map { ord $_ }, split("", $s);
$news.bufused = $news.strlen = $len || length($s);
return $news;
}
@@ -75,6 +114,13 @@
}
}
+sub Parrot_string_length($str) {
+ # This code written funny to be a bit more C-like
+ my $data = 0; my $callback = sub ($char, $data is rw) { $data++ };
+ $str.encoding.string_char_iterate($str, $callback, $data);
+ return $data;
+}
+
sub Parrot_string_grapheme_length($str) {
# This code written funny to be a bit more C-like
my $data = 0; my $callback = sub ($char, $data is rw) { $data++ };
@@ -84,7 +130,7 @@
sub Parrot_string_byte_length($str) { return $str.strlen }
-sub Parrot_string_index($str, $index) { ... }
+sub Parrot_string_index($str, $index) { return $str.encoding.char_at_index($str, $index) }
sub Parrot_string_grapheme_index($str, $index) { ... }
sub Parrot_string_find_substr($str, $substr) { ... }
@@ -97,4 +143,7 @@
sub Parrot_string_grapheme_replace($src, $offset, $len, $replacement) { ... }
sub Parrot_string_chopn($src, $count) { ... }
sub Parrot_string_chopn_inplace($src, $count) { ... }
-sub Parrot_string_grapheme_chopn($src, $count) { ... }
+
+sub Parrot_string_grapheme_chopn($src, $count) {
+ return Parrot_string_replace($src, Parrot_string_grapheme_length($src) - $count, $count, undef);
+}
Modified: branches/strings/pseudocode/t/create.t
==============================================================================
--- branches/strings/pseudocode/t/create.t (original)
+++ branches/strings/pseudocode/t/create.t Thu Jan 15 03:25:38 2009
@@ -1,8 +1,15 @@
use Test;
use ParrotString;
-plan 3;
+plan 8;
-my $str = Parrot_string_new_init("abcdef", 4, ParrotCharset::ASCII, ParrotEncoding::Byte);
+my $str = Parrot_string_new_init("flurble", 4, ParrotCharset::ASCII, ParrotEncoding::Byte);
ok($str.charset ~~ ParrotCharset::ASCII, "Charset set properly");
is(Parrot_string_grapheme_length($str), 4, "String length correct");
is(Parrot_string_byte_length($str), 4, "String length correct");
+is(Parrot_string_index($str, 1), ord("l"), "String indexing");
+
+$str = Parrot_string_new_init("\xce\xb3\xce\xb5\xce\xb9\xce\xac \xcf\x83\xce\xbf\xcf\x85 \xce\xba\xcf\x8c\xcf\x83\xce\xbc\xce\xbf\xcf\x82", 28, ParrotCharset::Unicode, ParrotEncoding::UTF8);
+ok($str.charset ~~ ParrotCharset::Unicode, "We're unicode");
+is(Parrot_string_byte_length($str), 28, "String byte length correct");
+is(Parrot_string_length($str), 15, "UTF8 char length correct");
+is(Parrot_string_index($str, 3), 0x3ac, "UTF8 string indexing");
-
[svn:parrot] r35582 - in branches/strings/pseudocode: . t
by simon