[svn:parrot] r35957 - in branches/strings/pseudocode: . ParrotEncoding

Front page | perl.cvs.parrot | Postings from January 2009
[svn:parrot] r35957 - in branches/strings/pseudocode: . ParrotEncoding

From:
simon
Date:
January 24, 2009 05:07
Subject:
[svn:parrot] r35957 - in branches/strings/pseudocode: . ParrotEncoding
Message ID:
20090124130708.64DA8CB9AE@x12.develooper.com
Author: simon
Date: Sat Jan 24 05:07:07 2009
New Revision: 35957

Added:
   branches/strings/pseudocode/ParrotEncoding/
   branches/strings/pseudocode/ParrotEncoding/Base.pm
   branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm
   branches/strings/pseudocode/ParrotEncoding/UTF8.pm
Modified:
   branches/strings/pseudocode/Encodings.pm

Log:
Rearrange things a bit, having all the encodings in one file was a bit unwieldy


Modified: branches/strings/pseudocode/Encodings.pm
==============================================================================
--- branches/strings/pseudocode/Encodings.pm	(original)
+++ branches/strings/pseudocode/Encodings.pm	Sat Jan 24 05:07:07 2009
@@ -1,183 +1,10 @@
-class ParrotEncoding::Base::Fixed {
-    our $.width;
-    method setup($str) { }
-    method string_length($str) { return $str.strlen / $str.encoding.width; }
+use ParrotEncoding::Base;
 
-    method string_char_iterate($str, $callback, $parameter) {
-        for (0..self.string_length($str)-1) { 
-            $callback(self.char_at_index($str,$_), $parameter); 
-        }
-    }
-
-    # We assume in the base case that grapheme==char, which is true for
-    # legacy, non-Unicode fixed width formats. Unicode fixed width
-    # formats that care about graphemes can override.
-   
-    method grapheme_at_index($str, $index) { 
-        return [ self.char_at_index($str, $index) ]; 
-    }
-    method string_grapheme_iterate($str, $callback, $parameter) {
-        for (0..self.string_length($str)-1) { 
-            $callback($str.encoding.grapheme_at_index($str,$_), $parameter); 
-        }
-    }
-
-    method chopn_inplace($str, $n) { $str.strlen -= $n * $.width }
-}
-
-class ParrotEncoding::Base::Variable {
-    method setup($str) { }
-    method string_length($str) {
-        # This code written funny to be a bit more C-like
-        my $data = 0; 
-        my $callback = sub ($char, $data is rw) { $data++ };
-        $str.encoding.string_char_iterate($str, $callback, $data);
-        return $data;
-    }
-}
-
-class ParrotEncoding::UTF8 is ParrotEncoding::Base::Variable {  
-    sub _skip($c) {
-        if $c <= 191 { return 1 }
-        if 191 < $c < 224 { return 2 }
-        return 3
-    }
-    sub _bytes_needed($c) {
-        if $c < 0x80 { return 1 }
-        if $c < 0x0800 { return 2 }
-        return 3;
-    }
-    sub char_at_byteoffset ($str, $offset is rw) { # Private helper
-        if ($offset > $str.strlen) { Parrot_debug_string($str); die "BUG: Asked for a byte "~$offset~" that's not there" };
-        my $c = $str.buffer.[$offset++];
-        if 191 < $c < 224 {
-            if ($offset + 1 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
-            $c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 );
-        } elsif $c >= 224 {
-            if ($offset + 2 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
-            $c = (($c +& 15) +< 12) 
-                +| (( $str.buffer.[$offset++] +& 63 ) +< 6);
-            $c  +|= $str.buffer.[$offset++] +& 63;
-        }
-        return $c;
-    }
-
-    method append_char($str, $c) {
-        $str.bufused += _bytes_needed($c);
-        $str.strlen  += _bytes_needed($c);
-        if ($c < 0x80) {
-            push $str.buffer, $c;
-        } elsif ($c < 0x0800) {
-            push $str.buffer, $c +> 6 +| 0xc0;
-            push $str.buffer, $c +& 0x3f +| 0x80;
-        } else {
-            push $str.buffer, $c +> 12 +| 0xe0;
-            push $str.buffer, $c +> 6 +& 0x3f +| 0x80;
-            push $str.buffer, $c +& 0x3f +| 0x80;
-        }
-    }
-
-    method append_grapheme($str, $g) { 
-        for (@($g)) { self.append_char($str, $_) }
-    }
-
-    method string_char_iterate ($str, $callback, $parameter) {
-        my $index = 0;
-        while ($index < $str.bufused) {
-            $callback(char_at_byteoffset($str, $index), $parameter);
-        }
-    }
-
-    method string_grapheme_iterate ($str, $callback, $parameter) {
-        if ($str.charset !~~ ParrotCharset::Unicode) {
-            # Although why you'd store non-Unicode in UTF8 is beyond me
-            my $to_unicode = sub ($c, $subcallback) {
-                $subcallback.[0].( [ $str.charset.to_unicode($c) ], $subcallback.[1]);
-            };
-            self.string_char_iterate($str, $to_unicode, [ $callback, $parameter ]);
-        }
-        # Collect characters into graphemes in a roughly O(n) way...
-        my $index = 0;
-        while ($index < $str.bufused) {
-            my $c = char_at_byteoffset($str, $index);
-
-            # If we're the last character, do the callback and give up
-            if ($index >= $str.bufused) { $callback([$c], $parameter); return; }
-
-            # At this point we know there is at least one following character.
-            # How many of them are combining?
-            my @grapheme = ( $c );
-            my $next_char;
-            my $nc_index = $index;
-            my $end_of_grapheme_sequence = $index;
-            while ($nc_index < $str.bufused and
-                   $next_char = char_at_byteoffset($str, $nc_index)
-                   and ParrotCharset::Unicode::is_combining($next_char)) {
-                   $end_of_grapheme_sequence = $nc_index;
-                   push @grapheme, @([ $next_char ]); # Work around horrible push/copy bug
-           }
-           $callback([@grapheme], $parameter);
-           $index = $end_of_grapheme_sequence;
-        }
-    }
-
-    # We're not going to cache this because if it's worth caching it's
-    # worth converting to a Parrot native string rather than keeping
-    # UTF8. We'll keep it dumb and working and people can optimise later
-    method char_at_index($str, $index) { 
-        my $i = $index + 0; # work around Rakudo assignment weirdness
-        my $offset = 0;
-        while $i-- > 0 { $offset += _skip($str.buffer[$offset])  }
-        return char_at_byteoffset($str, $offset);
-    }
-};
 class ParrotEncoding::UTF16  {  };
 class ParrotEncoding::UTF32  {  };
 class ParrotEncoding::EBCDIC {  };
-
-class ParrotEncoding::ParrotNative is ParrotEncoding::Base::Fixed {
-    our $.width = 1;
-
-    method setup($str) { $str.normalization = ParrotNormalization::NFG.new(); }
-    method append_grapheme ($str, $g) {
-        if (@($g) > 1) {
-            my $item;
-            $item = $str.normalization.get_grapheme_table_entry(@($g));
-            $str.buffer.push($item);
-        } else {
-            $str.buffer.push(@( $g ));
-        }
-        $str.bufused++;
-        $str.strlen++;
-    }
-
-    method string_char_iterate ($str, $callback, $parameter) {
-        for (0..$str.bufused-1) { 
-            my $grapheme = grapheme_at_index($str, $_);
-            for (@( $grapheme )) {
-                $callback($str.buffer.[$_], $parameter); 
-            }
-        }
-    }
-
-    method char_at_index($str, $index) { 
-        # We need to look inside each grapheme, since NFG stores individual
-        # graphemes and graphemes are composed of multiple characters - 
-        # this could be improved with caching later but we will 
-        ...
-    }
-
-    method grapheme_at_index($str, $index) {
-        if (!$str.normalization) { 
-            $str.charset.normalize($str, ParrotNormalization::NFG);
-        }
-        my $c = $str.buffer[$index];
-        if $c >= 0 { return [ $c ]; }
-        return $str.normalization.grapheme_table.[-$c - 1];
-        # We are allowed to be pally with the normalization internals
-        # because NFG is specific to ParrotEncoding.
-    }
-};
+use ParrotEncoding::UTF8;
+use ParrotEncoding::ParrotNative;
 
 class ParrotEncoding::Byte is ParrotEncoding::Base::Fixed {
     our $.width = 1;

Added: branches/strings/pseudocode/ParrotEncoding/Base.pm
==============================================================================
--- (empty file)
+++ branches/strings/pseudocode/ParrotEncoding/Base.pm	Sat Jan 24 05:07:07 2009
@@ -0,0 +1,37 @@
+class ParrotEncoding::Base::Fixed {
+    our $.width;
+    method setup($str) { }
+    method string_length($str) { return $str.strlen / $str.encoding.width; }
+
+    method string_char_iterate($str, $callback, $parameter) {
+        for (0..self.string_length($str)-1) { 
+            $callback(self.char_at_index($str,$_), $parameter); 
+        }
+    }
+
+    # We assume in the base case that grapheme==char, which is true for
+    # legacy, non-Unicode fixed width formats. Unicode fixed width
+    # formats that care about graphemes can override.
+   
+    method grapheme_at_index($str, $index) { 
+        return [ self.char_at_index($str, $index) ]; 
+    }
+    method string_grapheme_iterate($str, $callback, $parameter) {
+        for (0..self.string_length($str)-1) { 
+            $callback($str.encoding.grapheme_at_index($str,$_), $parameter); 
+        }
+    }
+
+    method chopn_inplace($str, $n) { $str.strlen -= $n * $.width }
+}
+
+class ParrotEncoding::Base::Variable {
+    method setup($str) { }
+    method string_length($str) {
+        # This code written funny to be a bit more C-like
+        my $data = 0; 
+        my $callback = sub ($char, $data is rw) { $data++ };
+        $str.encoding.string_char_iterate($str, $callback, $data);
+        return $data;
+    }
+}

Added: branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm
==============================================================================
--- (empty file)
+++ branches/strings/pseudocode/ParrotEncoding/ParrotNative.pm	Sat Jan 24 05:07:07 2009
@@ -0,0 +1,43 @@
+class ParrotEncoding::ParrotNative is ParrotEncoding::Base::Fixed {
+    our $.width = 1;
+
+    method setup($str) { $str.normalization = ParrotNormalization::NFG.new(); }
+    method append_grapheme ($str, $g) {
+        if (@($g) > 1) {
+            my $item;
+            $item = $str.normalization.get_grapheme_table_entry(@($g));
+            $str.buffer.push($item);
+        } else {
+            $str.buffer.push(@( $g ));
+        }
+        $str.bufused++;
+        $str.strlen++;
+    }
+
+    method string_char_iterate ($str, $callback, $parameter) {
+        for (0..$str.bufused-1) { 
+            my $grapheme = grapheme_at_index($str, $_);
+            for (@( $grapheme )) {
+                $callback($str.buffer.[$_], $parameter); 
+            }
+        }
+    }
+
+    method char_at_index($str, $index) { 
+        # We need to look inside each grapheme, since NFG stores individual
+        # graphemes and graphemes are composed of multiple characters - 
+        # this could be improved with caching later but we will 
+        ...
+    }
+
+    method grapheme_at_index($str, $index) {
+        if (!$str.normalization) { 
+            $str.charset.normalize($str, ParrotNormalization::NFG);
+        }
+        my $c = $str.buffer[$index];
+        if $c >= 0 { return [ $c ]; }
+        return $str.normalization.grapheme_table.[-$c - 1];
+        # We are allowed to be pally with the normalization internals
+        # because NFG is specific to ParrotEncoding.
+    }
+};

Added: branches/strings/pseudocode/ParrotEncoding/UTF8.pm
==============================================================================
--- (empty file)
+++ branches/strings/pseudocode/ParrotEncoding/UTF8.pm	Sat Jan 24 05:07:07 2009
@@ -0,0 +1,95 @@
+class ParrotEncoding::UTF8 is ParrotEncoding::Base::Variable {  
+    sub _skip($c) {
+        if $c <= 191 { return 1 }
+        if 191 < $c < 224 { return 2 }
+        return 3
+    }
+    sub _bytes_needed($c) {
+        if $c < 0x80 { return 1 }
+        if $c < 0x0800 { return 2 }
+        return 3;
+    }
+    sub char_at_byteoffset ($str, $offset is rw) { # Private helper
+        if ($offset > $str.strlen) { Parrot_debug_string($str); die "BUG: Asked for a byte "~$offset~" that's not there" };
+        my $c = $str.buffer.[$offset++];
+        if 191 < $c < 224 {
+            if ($offset + 1 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
+            $c = (($c +& 31) +< 6) +| ( $str.buffer.[$offset++] +& 63 );
+        } elsif $c >= 224 {
+            if ($offset + 2 > $str.strlen) { die "BUG: UTF8 encoding ran off end of string" }
+            $c = (($c +& 15) +< 12) 
+                +| (( $str.buffer.[$offset++] +& 63 ) +< 6);
+            $c  +|= $str.buffer.[$offset++] +& 63;
+        }
+        return $c;
+    }
+
+    method append_char($str, $c) {
+        $str.bufused += _bytes_needed($c);
+        $str.strlen  += _bytes_needed($c);
+        if ($c < 0x80) {
+            push $str.buffer, $c;
+        } elsif ($c < 0x0800) {
+            push $str.buffer, $c +> 6 +| 0xc0;
+            push $str.buffer, $c +& 0x3f +| 0x80;
+        } else {
+            push $str.buffer, $c +> 12 +| 0xe0;
+            push $str.buffer, $c +> 6 +& 0x3f +| 0x80;
+            push $str.buffer, $c +& 0x3f +| 0x80;
+        }
+    }
+
+    method append_grapheme($str, $g) { 
+        for (@($g)) { self.append_char($str, $_) }
+    }
+
+    method string_char_iterate ($str, $callback, $parameter) {
+        my $index = 0;
+        while ($index < $str.bufused) {
+            $callback(char_at_byteoffset($str, $index), $parameter);
+        }
+    }
+
+    method string_grapheme_iterate ($str, $callback, $parameter) {
+        if ($str.charset !~~ ParrotCharset::Unicode) {
+            # Although why you'd store non-Unicode in UTF8 is beyond me
+            my $to_unicode = sub ($c, $subcallback) {
+                $subcallback.[0].( [ $str.charset.to_unicode($c) ], $subcallback.[1]);
+            };
+            self.string_char_iterate($str, $to_unicode, [ $callback, $parameter ]);
+        }
+        # Collect characters into graphemes in a roughly O(n) way...
+        my $index = 0;
+        while ($index < $str.bufused) {
+            my $c = char_at_byteoffset($str, $index);
+
+            # If we're the last character, do the callback and give up
+            if ($index >= $str.bufused) { $callback([$c], $parameter); return; }
+
+            # At this point we know there is at least one following character.
+            # How many of them are combining?
+            my @grapheme = ( $c );
+            my $next_char;
+            my $nc_index = $index;
+            my $end_of_grapheme_sequence = $index;
+            while ($nc_index < $str.bufused and
+                   $next_char = char_at_byteoffset($str, $nc_index)
+                   and ParrotCharset::Unicode::is_combining($next_char)) {
+                   $end_of_grapheme_sequence = $nc_index;
+                   push @grapheme, @([ $next_char ]); # Work around horrible push/copy bug
+           }
+           $callback([@grapheme], $parameter);
+           $index = $end_of_grapheme_sequence;
+        }
+    }
+
+    # We're not going to cache this because if it's worth caching it's
+    # worth converting to a Parrot native string rather than keeping
+    # UTF8. We'll keep it dumb and working and people can optimise later
+    method char_at_index($str, $index) { 
+        my $i = $index + 0; # work around Rakudo assignment weirdness
+        my $offset = 0;
+        while $i-- > 0 { $offset += _skip($str.buffer[$offset])  }
+        return char_at_byteoffset($str, $offset);
+    }
+};
[svn:parrot] r35957 - in branches/strings/pseudocode: . ParrotEncoding by simon
nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About