[svn:parrot] r35600 - branches/strings/pseudocode

Front page | perl.cvs.parrot | Postings from January 2009

[svn:parrot] r35600 - branches/strings/pseudocode

From:

simon

Date:

January 15, 2009 11:12

Subject:

[svn:parrot] r35600 - branches/strings/pseudocode

Message ID:

20090115191155.5076CCB9AE@x12.develooper.com

Author: simon
Date: Thu Jan 15 11:11:54 2009
New Revision: 35600

Modified:
   branches/strings/pseudocode/Charsets.pm
   branches/strings/pseudocode/Encodings.pm

Log:
This is the first part of NFG normalization support.


Modified: branches/strings/pseudocode/Charsets.pm
==============================================================================
--- branches/strings/pseudocode/Charsets.pm	(original)
+++ branches/strings/pseudocode/Charsets.pm	Thu Jan 15 11:11:54 2009
@@ -1,5 +1,69 @@
-class ParrotCharset::Unicode {  };
-class ParrotCharset::ASCII   {  };
+class ParrotCharset::Base {
+    method normalize($str, $nf) { return } # Normalization is a Unicode thing
+}
+
+class ParrotCharset::Unicode { 
+
+    sub is_combining($x) {
+        return 0x0300 <= $x <= 0x36f    || 0x0483 <= $x <= 0x486    ||
+               0x0488 <= $x <= 0x489    || 0x135F <= $x <= 0x135f   ||
+               0x1DC0 <= $x <= 0x1dc3   || 0x20D0 <= $x <= 0x20eb   ||
+               0x3099 <= $x <= 0x309a   || 0xFE20 <= $x <= 0xfe23   ||
+               0x1D165 <= $x <= 0x1d169 || 0x1D16D <= $x <= 0x1d172 ||
+               0x1D17B <= $x <= 0x1d182 || 0x1D185 <= $x <= 0x1d18b ||
+               0x1D1AA <= $x <= 0x1d1ad || 0x1D242 <= $x <= 0x1D244
+    }
+
+    method to_unicode($c) { return $c }
+    method normalize($str, $nf) {
+        return if $str.normalization and $str.normalization ~~ $nf;
+        # You might think that normalization requires a complicated
+        # dispatch based on what form we currently have and what form we
+        # want, but it's easier than that: the forms are hierarchical;
+        # to go to NFC you have to go through NFD, and to go to NFG you
+        # have to go through NFC. So...
+        if (!$str.normalization or $str.normalization !~~ Normalization::NFD) {
+            # Everyone starts in fully decomposed form (NFD)
+            # This is code-heavy and we'll do it in ICU in Parrot.
+            # Here, we just pretend we've already done it.
+            $str.normalization = ParrotNormalization::NFD;
+            return if $nf ~~ ParrotNormalization::NFD;
+        }
+
+        # Then from there convert to fully composed form (NFC)
+        # Equally just pretend we've already done it.
+        $str.normalization = ParrotNormalization::NFC;
+        return if $str.normalization and $str.normalization ~~ $nf;
+
+        # Now we do the NFC->NFG conversion
+        my @nfc = $str.buffer;
+        $str.normalization = ParrotNormalization::NFG.new(); # Beware!
+        my @nfg;
+        my $len = $str.strlen;
+        my $offset = 0;
+        my $count = 0;
+        while $offset < $len {
+            # Find end of current grapheme sequence
+            my $e = 0+$offset; 
+            $e++ while $e+1 <= $len && is_combining($nfc[$e+1]);
+            # Current grapheme sequence runs from $offset to $e.
+            if ($e - $offset) == 0 { # Simple case first
+                push @nfg, $nfc[$offset];
+           } else {
+                push @nfg, $str.normalization.get_grapheme_table_entry(@nfg[$offset..$e], $e-$offset)
+           }
+           $offset = $e + 1;
+           $count++;
+        }
+        $str.buffer = @nfg;
+        $str.strlen = $count;
+        return if $str.normalization ~~ $nf;
+        die "We wanted "~$nf~" and we ended up with "~$str.normalization;
+    }
+};
+class ParrotCharset::ASCII   {
+    method to_unicode($c) { return $c }
+};
 class ParrotCharset::Binary  {  };
 class ParrotCharset::SJIS    {  };
 class ParrotCharset::EUCJP   {  };

Modified: branches/strings/pseudocode/Encodings.pm
==============================================================================
--- branches/strings/pseudocode/Encodings.pm	(original)
+++ branches/strings/pseudocode/Encodings.pm	Thu Jan 15 11:11:54 2009
@@ -50,8 +50,11 @@
     method char_at_index($str, $index) { return $str.buffer.[$index] }
 
     method grapheme_at_index($str, $index) {
-        $str.charset.normalize($str, ParrotNormalization::NFG);
-        return $str.buffer.[$index]
+        if (!$str.normalization) { 
+            $str.charset.normalize($str, ParrotNormalization::NFG);
+            return $str.buffer.[$index]
+        }
+        return $str.normalization.grapheme_at_index($str, $index);
     }
 };
 class ParrotEncoding::Byte is ParrotEncoding::ParrotNative; # Just a bit thinner

[svn:parrot] r35600 - branches/strings/pseudocode by simon

nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About