Front page | perl.cvs.parrot |
Postings from January 2009
[svn:parrot] r35957 - in branches/strings/pseudocode: . ParrotEncoding
From:
simon
Date:
January 24, 2009 05:07
Subject:
[svn:parrot] r35957 - in branches/strings/pseudocode: . ParrotEncoding
Message ID:
20090124130708.64DA8CB9AE@x12.develooper.com
Author: simon
Date: Sat Jan 24 05:07:07 2009
New Revision: 35957
Added:
branches/strings/pseudocode/ParrotEncoding/
branches/strings/pseudocode/ParrotEncoding/Base.pm
branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm
branches/strings/pseudocode/ParrotEncoding/UTF8.pm
Modified:
branches/strings/pseudocode/Encodings.pm
Log:
Rearrange things a bit, having all the encodings in one file was a bit unwieldy
Modified: branches/strings/pseudocode/Encodings.pm
==============================================================================
--- branches/strings/pseudocode/Encodings.pm (original)
+++ branches/strings/pseudocode/Encodings.pm Sat Jan 24 05:07:07 2009
@@ -1,183 +1,10 @@
-class ParrotEncoding::Base::Fixed {
- our $.width;
- method setup($str) { }
- method string_length($str) { return $str.strlen / $str.encoding.width; }
+use ParrotEncoding::Base;
- method string_char_iterate($str, $callback, $parameter) {
- for (0..self.string_length($str)-1) {
- $callback(self.char_at_index($str,$_), $parameter);
- }
- }
-
- # We assume in the base case that grapheme==char, which is true for
- # legacy, non-Unicode fixed width formats. Unicode fixed width
- # formats that care about graphemes can override.
-
- method grapheme_at_index($str, $index) {
- return [ self.char_at_index($str, $index) ];
- }
- method string_grapheme_iterate($str, $callback, $parameter) {
- for (0..self.string_length($str)-1) {
- $callback($str.encoding.grapheme_at_index($str,$_), $parameter);
- }
- }
-
- method chopn_inplace($str, $n) { $str.strlen -= $n * $.width }
-}
-
-class ParrotEncoding::Base::Variable {
- method setup($str) { }
- method string_length($str) {
- # This code written funny to be a bit more C-like
- my $data = 0;
- my $callback = sub ($char, $data is rw) { $data++ };
- $str.encoding.string_char_iterate($str, $callback, $data);
- return $data;
- }
-}
-
-class ParrotEncoding::UTF8 is ParrotEncoding::Base::Variable {
- sub _skip($c) {
- if $c <= 191 { return 1 }
- if 191 < $c < 224 { return 2 }
- return 3
- }
- sub _bytes_needed($c) {
- if $c < 0x80 { return 1 }
- if $c < 0x0800 { return 2 }
- return 3;
- }
- sub char_at_byteoffset ($str, $offset is rw) { # Private helper
- if ($offset > $str.strlen) { Parrot_debug_string($str); die "BUG: Asked for a byte "~$offset~" that's not there" };
- my $c = $str.buffer.[$offset++];
- if 191 < $c < 224 {
- if ($offset + 1 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
- $c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 );
- } elsif $c >= 224 {
- if ($offset + 2 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
- $c = (($c +& 15) +< 12)
- +| (( $str.buffer.[$offset++] +& 63 ) +< 6);
- $c +|= $str.buffer.[$offset++] +& 63;
- }
- return $c;
- }
-
- method append_char($str, $c) {
- $str.bufused += _bytes_needed($c);
- $str.strlen += _bytes_needed($c);
- if ($c < 0x80) {
- push $str.buffer, $c;
- } elsif ($c < 0x0800) {
- push $str.buffer, $c +> 6 +| 0xc0;
- push $str.buffer, $c +& 0x3f +| 0x80;
- } else {
- push $str.buffer, $c +> 12 +| 0xe0;
- push $str.buffer, $c +> 6 +& 0x3f +| 0x80;
- push $str.buffer, $c +& 0x3f +| 0x80;
- }
- }
-
- method append_grapheme($str, $g) {
- for (@($g)) { self.append_char($str, $_) }
- }
-
- method string_char_iterate ($str, $callback, $parameter) {
- my $index = 0;
- while ($index < $str.bufused) {
- $callback(char_at_byteoffset($str, $index), $parameter);
- }
- }
-
- method string_grapheme_iterate ($str, $callback, $parameter) {
- if ($str.charset !~~ ParrotCharset::Unicode) {
- # Although why you'd store non-Unicode in UTF8 is beyond me
- my $to_unicode = sub ($c, $subcallback) {
- $subcallback.[0].( [ $str.charset.to_unicode($c) ], $subcallback.[1]);
- };
- self.string_char_iterate($str, $to_unicode, [ $callback, $parameter ]);
- }
- # Collect characters into graphemes in a roughly O(n) way...
- my $index = 0;
- while ($index < $str.bufused) {
- my $c = char_at_byteoffset($str, $index);
-
- # If we're the last character, do the callback and give up
- if ($index >= $str.bufused) { $callback([$c], $parameter); return; }
-
- # At this point we know there is at least one following character.
- # How many of them are combining?
- my @grapheme = ( $c );
- my $next_char;
- my $nc_index = $index;
- my $end_of_grapheme_sequence = $index;
- while ($nc_index < $str.bufused and
- $next_char = char_at_byteoffset($str, $nc_index)
- and ParrotCharset::Unicode::is_combining($next_char)) {
- $end_of_grapheme_sequence = $nc_index;
- push @grapheme, @([ $next_char ]); # Work around horrible push/copy bug
- }
- $callback([@grapheme], $parameter);
- $index = $end_of_grapheme_sequence;
- }
- }
-
- # We're not going to cache this because if it's worth caching it's
- # worth converting to a Parrot native string rather than keeping
- # UTF8. We'll keep it dumb and working and people can optimise later
- method char_at_index($str, $index) {
- my $i = $index + 0; # work around Rakudo assignment weirdness
- my $offset = 0;
- while $i-- > 0 { $offset += _skip($str.buffer[$offset]) }
- return char_at_byteoffset($str, $offset);
- }
-};
class ParrotEncoding::UTF16 { };
class ParrotEncoding::UTF32 { };
class ParrotEncoding::EBCDIC { };
-
-class ParrotEncoding::ParrotNative is ParrotEncoding::Base::Fixed {
- our $.width = 1;
-
- method setup($str) { $str.normalization = ParrotNormalization::NFG.new(); }
- method append_grapheme ($str, $g) {
- if (@($g) > 1) {
- my $item;
- $item = $str.normalization.get_grapheme_table_entry(@($g));
- $str.buffer.push($item);
- } else {
- $str.buffer.push(@( $g ));
- }
- $str.bufused++;
- $str.strlen++;
- }
-
- method string_char_iterate ($str, $callback, $parameter) {
- for (0..$str.bufused-1) {
- my $grapheme = grapheme_at_index($str, $_);
- for (@( $grapheme )) {
- $callback($str.buffer.[$_], $parameter);
- }
- }
- }
-
- method char_at_index($str, $index) {
- # We need to look inside each grapheme, since NFG stores individual
- # graphemes and graphemes are composed of multiple characters -
- # this could be improved with caching later but we will
- ...
- }
-
- method grapheme_at_index($str, $index) {
- if (!$str.normalization) {
- $str.charset.normalize($str, ParrotNormalization::NFG);
- }
- my $c = $str.buffer[$index];
- if $c >= 0 { return [ $c ]; }
- return $str.normalization.grapheme_table.[-$c - 1];
- # We are allowed to be pally with the normalization internals
- # because NFG is specific to ParrotEncoding.
- }
-};
+use ParrotEncoding::UTF8;
+use ParrotEncoding::ParrotNative;
class ParrotEncoding::Byte is ParrotEncoding::Base::Fixed {
our $.width = 1;
Added: branches/strings/pseudocode/ParrotEncoding/Base.pm
==============================================================================
--- (empty file)
+++ branches/strings/pseudocode/ParrotEncoding/Base.pm Sat Jan 24 05:07:07 2009
@@ -0,0 +1,37 @@
+class ParrotEncoding::Base::Fixed {
+ our $.width;
+ method setup($str) { }
+ method string_length($str) { return $str.strlen / $str.encoding.width; }
+
+ method string_char_iterate($str, $callback, $parameter) {
+ for (0..self.string_length($str)-1) {
+ $callback(self.char_at_index($str,$_), $parameter);
+ }
+ }
+
+ # We assume in the base case that grapheme==char, which is true for
+ # legacy, non-Unicode fixed width formats. Unicode fixed width
+ # formats that care about graphemes can override.
+
+ method grapheme_at_index($str, $index) {
+ return [ self.char_at_index($str, $index) ];
+ }
+ method string_grapheme_iterate($str, $callback, $parameter) {
+ for (0..self.string_length($str)-1) {
+ $callback($str.encoding.grapheme_at_index($str,$_), $parameter);
+ }
+ }
+
+ method chopn_inplace($str, $n) { $str.strlen -= $n * $.width }
+}
+
+class ParrotEncoding::Base::Variable {
+ method setup($str) { }
+ method string_length($str) {
+ # This code written funny to be a bit more C-like
+ my $data = 0;
+ my $callback = sub ($char, $data is rw) { $data++ };
+ $str.encoding.string_char_iterate($str, $callback, $data);
+ return $data;
+ }
+}
Added: branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm
==============================================================================
--- (empty file)
+++ branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm Sat Jan 24 05:07:07 2009
@@ -0,0 +1,43 @@
+class ParrotEncoding::ParrotNative is ParrotEncoding::Base::Fixed {
+ our $.width = 1;
+
+ method setup($str) { $str.normalization = ParrotNormalization::NFG.new(); }
+ method append_grapheme ($str, $g) {
+ if (@($g) > 1) {
+ my $item;
+ $item = $str.normalization.get_grapheme_table_entry(@($g));
+ $str.buffer.push($item);
+ } else {
+ $str.buffer.push(@( $g ));
+ }
+ $str.bufused++;
+ $str.strlen++;
+ }
+
+ method string_char_iterate ($str, $callback, $parameter) {
+ for (0..$str.bufused-1) {
+ my $grapheme = grapheme_at_index($str, $_);
+ for (@( $grapheme )) {
+ $callback($str.buffer.[$_], $parameter);
+ }
+ }
+ }
+
+ method char_at_index($str, $index) {
+ # We need to look inside each grapheme, since NFG stores individual
+ # graphemes and graphemes are composed of multiple characters -
+ # this could be improved with caching later but we will
+ ...
+ }
+
+ method grapheme_at_index($str, $index) {
+ if (!$str.normalization) {
+ $str.charset.normalize($str, ParrotNormalization::NFG);
+ }
+ my $c = $str.buffer[$index];
+ if $c >= 0 { return [ $c ]; }
+ return $str.normalization.grapheme_table.[-$c - 1];
+ # We are allowed to be pally with the normalization internals
+ # because NFG is specific to ParrotEncoding.
+ }
+};
Added: branches/strings/pseudocode/ParrotEncoding/UTF8.pm
==============================================================================
--- (empty file)
+++ branches/strings/pseudocode/ParrotEncoding/UTF8.pm Sat Jan 24 05:07:07 2009
@@ -0,0 +1,95 @@
+class ParrotEncoding::UTF8 is ParrotEncoding::Base::Variable {
+ sub _skip($c) {
+ if $c <= 191 { return 1 }
+ if 191 < $c < 224 { return 2 }
+ return 3
+ }
+ sub _bytes_needed($c) {
+ if $c < 0x80 { return 1 }
+ if $c < 0x0800 { return 2 }
+ return 3;
+ }
+ sub char_at_byteoffset ($str, $offset is rw) { # Private helper
+ if ($offset > $str.strlen) { Parrot_debug_string($str); die "BUG: Asked for a byte "~$offset~" that's not there" };
+ my $c = $str.buffer.[$offset++];
+ if 191 < $c < 224 {
+ if ($offset + 1 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
+ $c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 );
+ } elsif $c >= 224 {
+ if ($offset + 2 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
+ $c = (($c +& 15) +< 12)
+ +| (( $str.buffer.[$offset++] +& 63 ) +< 6);
+ $c +|= $str.buffer.[$offset++] +& 63;
+ }
+ return $c;
+ }
+
+ method append_char($str, $c) {
+ $str.bufused += _bytes_needed($c);
+ $str.strlen += _bytes_needed($c);
+ if ($c < 0x80) {
+ push $str.buffer, $c;
+ } elsif ($c < 0x0800) {
+ push $str.buffer, $c +> 6 +| 0xc0;
+ push $str.buffer, $c +& 0x3f +| 0x80;
+ } else {
+ push $str.buffer, $c +> 12 +| 0xe0;
+ push $str.buffer, $c +> 6 +& 0x3f +| 0x80;
+ push $str.buffer, $c +& 0x3f +| 0x80;
+ }
+ }
+
+ method append_grapheme($str, $g) {
+ for (@($g)) { self.append_char($str, $_) }
+ }
+
+ method string_char_iterate ($str, $callback, $parameter) {
+ my $index = 0;
+ while ($index < $str.bufused) {
+ $callback(char_at_byteoffset($str, $index), $parameter);
+ }
+ }
+
+ method string_grapheme_iterate ($str, $callback, $parameter) {
+ if ($str.charset !~~ ParrotCharset::Unicode) {
+ # Although why you'd store non-Unicode in UTF8 is beyond me
+ my $to_unicode = sub ($c, $subcallback) {
+ $subcallback.[0].( [ $str.charset.to_unicode($c) ], $subcallback.[1]);
+ };
+ self.string_char_iterate($str, $to_unicode, [ $callback, $parameter ]);
+ }
+ # Collect characters into graphemes in a roughly O(n) way...
+ my $index = 0;
+ while ($index < $str.bufused) {
+ my $c = char_at_byteoffset($str, $index);
+
+ # If we're the last character, do the callback and give up
+ if ($index >= $str.bufused) { $callback([$c], $parameter); return; }
+
+ # At this point we know there is at least one following character.
+ # How many of them are combining?
+ my @grapheme = ( $c );
+ my $next_char;
+ my $nc_index = $index;
+ my $end_of_grapheme_sequence = $index;
+ while ($nc_index < $str.bufused and
+ $next_char = char_at_byteoffset($str, $nc_index)
+ and ParrotCharset::Unicode::is_combining($next_char)) {
+ $end_of_grapheme_sequence = $nc_index;
+ push @grapheme, @([ $next_char ]); # Work around horrible push/copy bug
+ }
+ $callback([@grapheme], $parameter);
+ $index = $end_of_grapheme_sequence;
+ }
+ }
+
+ # We're not going to cache this because if it's worth caching it's
+ # worth converting to a Parrot native string rather than keeping
+ # UTF8. We'll keep it dumb and working and people can optimise later
+ method char_at_index($str, $index) {
+ my $i = $index + 0; # work around Rakudo assignment weirdness
+ my $offset = 0;
+ while $i-- > 0 { $offset += _skip($str.buffer[$offset]) }
+ return char_at_byteoffset($str, $offset);
+ }
+};
-
[svn:parrot] r35957 - in branches/strings/pseudocode: . ParrotEncoding
by simon