Dependency update, new golang.org/x package names

This commit is contained in:
Jakob Borg
2014-11-30 00:17:00 +01:00
parent 5dbaf6ceb0
commit 126c4e9a06
51 changed files with 7847 additions and 7911 deletions

View File

@@ -0,0 +1,35 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bcrypt
import "encoding/base64"
const alphabet = "./ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
var bcEncoding = base64.NewEncoding(alphabet)
func base64Encode(src []byte) []byte {
n := bcEncoding.EncodedLen(len(src))
dst := make([]byte, n)
bcEncoding.Encode(dst, src)
for dst[n-1] == '=' {
n--
}
return dst[:n]
}
func base64Decode(src []byte) ([]byte, error) {
numOfEquals := 4 - (len(src) % 4)
for i := 0; i < numOfEquals; i++ {
src = append(src, '=')
}
dst := make([]byte, bcEncoding.DecodedLen(len(src)))
n, err := bcEncoding.Decode(dst, src)
if err != nil {
return nil, err
}
return dst[:n], nil
}

View File

@@ -0,0 +1,294 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bcrypt implements Provos and Mazières's bcrypt adaptive hashing
// algorithm. See http://www.usenix.org/event/usenix99/provos/provos.pdf
package bcrypt
// The code is a port of Provos and Mazières's C implementation.
import (
"crypto/rand"
"crypto/subtle"
"errors"
"fmt"
"golang.org/x/crypto/blowfish"
"io"
"strconv"
)
const (
MinCost int = 4 // the minimum allowable cost as passed in to GenerateFromPassword
MaxCost int = 31 // the maximum allowable cost as passed in to GenerateFromPassword
DefaultCost int = 10 // the cost that will actually be set if a cost below MinCost is passed into GenerateFromPassword
)
// The error returned from CompareHashAndPassword when a password and hash do
// not match.
var ErrMismatchedHashAndPassword = errors.New("crypto/bcrypt: hashedPassword is not the hash of the given password")
// The error returned from CompareHashAndPassword when a hash is too short to
// be a bcrypt hash.
var ErrHashTooShort = errors.New("crypto/bcrypt: hashedSecret too short to be a bcrypted password")
// The error returned from CompareHashAndPassword when a hash was created with
// a bcrypt algorithm newer than this implementation.
type HashVersionTooNewError byte
func (hv HashVersionTooNewError) Error() string {
return fmt.Sprintf("crypto/bcrypt: bcrypt algorithm version '%c' requested is newer than current version '%c'", byte(hv), majorVersion)
}
// The error returned from CompareHashAndPassword when a hash starts with something other than '$'
type InvalidHashPrefixError byte
func (ih InvalidHashPrefixError) Error() string {
return fmt.Sprintf("crypto/bcrypt: bcrypt hashes must start with '$', but hashedSecret started with '%c'", byte(ih))
}
type InvalidCostError int
func (ic InvalidCostError) Error() string {
return fmt.Sprintf("crypto/bcrypt: cost %d is outside allowed range (%d,%d)", int(ic), int(MinCost), int(MaxCost))
}
const (
majorVersion = '2'
minorVersion = 'a'
maxSaltSize = 16
maxCryptedHashSize = 23
encodedSaltSize = 22
encodedHashSize = 31
minHashSize = 59
)
// magicCipherData is an IV for the 64 Blowfish encryption calls in
// bcrypt(). It's the string "OrpheanBeholderScryDoubt" in big-endian bytes.
var magicCipherData = []byte{
0x4f, 0x72, 0x70, 0x68,
0x65, 0x61, 0x6e, 0x42,
0x65, 0x68, 0x6f, 0x6c,
0x64, 0x65, 0x72, 0x53,
0x63, 0x72, 0x79, 0x44,
0x6f, 0x75, 0x62, 0x74,
}
type hashed struct {
hash []byte
salt []byte
cost int // allowed range is MinCost to MaxCost
major byte
minor byte
}
// GenerateFromPassword returns the bcrypt hash of the password at the given
// cost. If the cost given is less than MinCost, the cost will be set to
// DefaultCost, instead. Use CompareHashAndPassword, as defined in this package,
// to compare the returned hashed password with its cleartext version.
func GenerateFromPassword(password []byte, cost int) ([]byte, error) {
p, err := newFromPassword(password, cost)
if err != nil {
return nil, err
}
return p.Hash(), nil
}
// CompareHashAndPassword compares a bcrypt hashed password with its possible
// plaintext equivalent. Returns nil on success, or an error on failure.
func CompareHashAndPassword(hashedPassword, password []byte) error {
p, err := newFromHash(hashedPassword)
if err != nil {
return err
}
otherHash, err := bcrypt(password, p.cost, p.salt)
if err != nil {
return err
}
otherP := &hashed{otherHash, p.salt, p.cost, p.major, p.minor}
if subtle.ConstantTimeCompare(p.Hash(), otherP.Hash()) == 1 {
return nil
}
return ErrMismatchedHashAndPassword
}
// Cost returns the hashing cost used to create the given hashed
// password. When, in the future, the hashing cost of a password system needs
// to be increased in order to adjust for greater computational power, this
// function allows one to establish which passwords need to be updated.
func Cost(hashedPassword []byte) (int, error) {
p, err := newFromHash(hashedPassword)
if err != nil {
return 0, err
}
return p.cost, nil
}
func newFromPassword(password []byte, cost int) (*hashed, error) {
if cost < MinCost {
cost = DefaultCost
}
p := new(hashed)
p.major = majorVersion
p.minor = minorVersion
err := checkCost(cost)
if err != nil {
return nil, err
}
p.cost = cost
unencodedSalt := make([]byte, maxSaltSize)
_, err = io.ReadFull(rand.Reader, unencodedSalt)
if err != nil {
return nil, err
}
p.salt = base64Encode(unencodedSalt)
hash, err := bcrypt(password, p.cost, p.salt)
if err != nil {
return nil, err
}
p.hash = hash
return p, err
}
func newFromHash(hashedSecret []byte) (*hashed, error) {
if len(hashedSecret) < minHashSize {
return nil, ErrHashTooShort
}
p := new(hashed)
n, err := p.decodeVersion(hashedSecret)
if err != nil {
return nil, err
}
hashedSecret = hashedSecret[n:]
n, err = p.decodeCost(hashedSecret)
if err != nil {
return nil, err
}
hashedSecret = hashedSecret[n:]
// The "+2" is here because we'll have to append at most 2 '=' to the salt
// when base64 decoding it in expensiveBlowfishSetup().
p.salt = make([]byte, encodedSaltSize, encodedSaltSize+2)
copy(p.salt, hashedSecret[:encodedSaltSize])
hashedSecret = hashedSecret[encodedSaltSize:]
p.hash = make([]byte, len(hashedSecret))
copy(p.hash, hashedSecret)
return p, nil
}
func bcrypt(password []byte, cost int, salt []byte) ([]byte, error) {
cipherData := make([]byte, len(magicCipherData))
copy(cipherData, magicCipherData)
c, err := expensiveBlowfishSetup(password, uint32(cost), salt)
if err != nil {
return nil, err
}
for i := 0; i < 24; i += 8 {
for j := 0; j < 64; j++ {
c.Encrypt(cipherData[i:i+8], cipherData[i:i+8])
}
}
// Bug compatibility with C bcrypt implementations. We only encode 23 of
// the 24 bytes encrypted.
hsh := base64Encode(cipherData[:maxCryptedHashSize])
return hsh, nil
}
func expensiveBlowfishSetup(key []byte, cost uint32, salt []byte) (*blowfish.Cipher, error) {
csalt, err := base64Decode(salt)
if err != nil {
return nil, err
}
// Bug compatibility with C bcrypt implementations. They use the trailing
// NULL in the key string during expansion.
ckey := append(key, 0)
c, err := blowfish.NewSaltedCipher(ckey, csalt)
if err != nil {
return nil, err
}
var i, rounds uint64
rounds = 1 << cost
for i = 0; i < rounds; i++ {
blowfish.ExpandKey(ckey, c)
blowfish.ExpandKey(csalt, c)
}
return c, nil
}
func (p *hashed) Hash() []byte {
arr := make([]byte, 60)
arr[0] = '$'
arr[1] = p.major
n := 2
if p.minor != 0 {
arr[2] = p.minor
n = 3
}
arr[n] = '$'
n += 1
copy(arr[n:], []byte(fmt.Sprintf("%02d", p.cost)))
n += 2
arr[n] = '$'
n += 1
copy(arr[n:], p.salt)
n += encodedSaltSize
copy(arr[n:], p.hash)
n += encodedHashSize
return arr[:n]
}
func (p *hashed) decodeVersion(sbytes []byte) (int, error) {
if sbytes[0] != '$' {
return -1, InvalidHashPrefixError(sbytes[0])
}
if sbytes[1] > majorVersion {
return -1, HashVersionTooNewError(sbytes[1])
}
p.major = sbytes[1]
n := 3
if sbytes[2] != '$' {
p.minor = sbytes[2]
n++
}
return n, nil
}
// sbytes should begin where decodeVersion left off.
func (p *hashed) decodeCost(sbytes []byte) (int, error) {
cost, err := strconv.Atoi(string(sbytes[0:2]))
if err != nil {
return -1, err
}
err = checkCost(cost)
if err != nil {
return -1, err
}
p.cost = cost
return 3, nil
}
func (p *hashed) String() string {
return fmt.Sprintf("&{hash: %#v, salt: %#v, cost: %d, major: %c, minor: %c}", string(p.hash), p.salt, p.cost, p.major, p.minor)
}
func checkCost(cost int) error {
if cost < MinCost || cost > MaxCost {
return InvalidCostError(cost)
}
return nil
}

View File

@@ -0,0 +1,226 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bcrypt
import (
"bytes"
"fmt"
"testing"
)
func TestBcryptingIsEasy(t *testing.T) {
pass := []byte("mypassword")
hp, err := GenerateFromPassword(pass, 0)
if err != nil {
t.Fatalf("GenerateFromPassword error: %s", err)
}
if CompareHashAndPassword(hp, pass) != nil {
t.Errorf("%v should hash %s correctly", hp, pass)
}
notPass := "notthepass"
err = CompareHashAndPassword(hp, []byte(notPass))
if err != ErrMismatchedHashAndPassword {
t.Errorf("%v and %s should be mismatched", hp, notPass)
}
}
func TestBcryptingIsCorrect(t *testing.T) {
pass := []byte("allmine")
salt := []byte("XajjQvNhvvRt5GSeFk1xFe")
expectedHash := []byte("$2a$10$XajjQvNhvvRt5GSeFk1xFeyqRrsxkhBkUiQeg0dt.wU1qD4aFDcga")
hash, err := bcrypt(pass, 10, salt)
if err != nil {
t.Fatalf("bcrypt blew up: %v", err)
}
if !bytes.HasSuffix(expectedHash, hash) {
t.Errorf("%v should be the suffix of %v", hash, expectedHash)
}
h, err := newFromHash(expectedHash)
if err != nil {
t.Errorf("Unable to parse %s: %v", string(expectedHash), err)
}
// This is not the safe way to compare these hashes. We do this only for
// testing clarity. Use bcrypt.CompareHashAndPassword()
if err == nil && !bytes.Equal(expectedHash, h.Hash()) {
t.Errorf("Parsed hash %v should equal %v", h.Hash(), expectedHash)
}
}
func TestVeryShortPasswords(t *testing.T) {
key := []byte("k")
salt := []byte("XajjQvNhvvRt5GSeFk1xFe")
_, err := bcrypt(key, 10, salt)
if err != nil {
t.Errorf("One byte key resulted in error: %s", err)
}
}
func TestTooLongPasswordsWork(t *testing.T) {
salt := []byte("XajjQvNhvvRt5GSeFk1xFe")
// One byte over the usual 56 byte limit that blowfish has
tooLongPass := []byte("012345678901234567890123456789012345678901234567890123456")
tooLongExpected := []byte("$2a$10$XajjQvNhvvRt5GSeFk1xFe5l47dONXg781AmZtd869sO8zfsHuw7C")
hash, err := bcrypt(tooLongPass, 10, salt)
if err != nil {
t.Fatalf("bcrypt blew up on long password: %v", err)
}
if !bytes.HasSuffix(tooLongExpected, hash) {
t.Errorf("%v should be the suffix of %v", hash, tooLongExpected)
}
}
type InvalidHashTest struct {
err error
hash []byte
}
var invalidTests = []InvalidHashTest{
{ErrHashTooShort, []byte("$2a$10$fooo")},
{ErrHashTooShort, []byte("$2a")},
{HashVersionTooNewError('3'), []byte("$3a$10$sssssssssssssssssssssshhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh")},
{InvalidHashPrefixError('%'), []byte("%2a$10$sssssssssssssssssssssshhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh")},
{InvalidCostError(32), []byte("$2a$32$sssssssssssssssssssssshhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh")},
}
func TestInvalidHashErrors(t *testing.T) {
check := func(name string, expected, err error) {
if err == nil {
t.Errorf("%s: Should have returned an error", name)
}
if err != nil && err != expected {
t.Errorf("%s gave err %v but should have given %v", name, err, expected)
}
}
for _, iht := range invalidTests {
_, err := newFromHash(iht.hash)
check("newFromHash", iht.err, err)
err = CompareHashAndPassword(iht.hash, []byte("anything"))
check("CompareHashAndPassword", iht.err, err)
}
}
func TestUnpaddedBase64Encoding(t *testing.T) {
original := []byte{101, 201, 101, 75, 19, 227, 199, 20, 239, 236, 133, 32, 30, 109, 243, 30}
encodedOriginal := []byte("XajjQvNhvvRt5GSeFk1xFe")
encoded := base64Encode(original)
if !bytes.Equal(encodedOriginal, encoded) {
t.Errorf("Encoded %v should have equaled %v", encoded, encodedOriginal)
}
decoded, err := base64Decode(encodedOriginal)
if err != nil {
t.Fatalf("base64Decode blew up: %s", err)
}
if !bytes.Equal(decoded, original) {
t.Errorf("Decoded %v should have equaled %v", decoded, original)
}
}
func TestCost(t *testing.T) {
suffix := "XajjQvNhvvRt5GSeFk1xFe5l47dONXg781AmZtd869sO8zfsHuw7C"
for _, vers := range []string{"2a", "2"} {
for _, cost := range []int{4, 10} {
s := fmt.Sprintf("$%s$%02d$%s", vers, cost, suffix)
h := []byte(s)
actual, err := Cost(h)
if err != nil {
t.Errorf("Cost, error: %s", err)
continue
}
if actual != cost {
t.Errorf("Cost, expected: %d, actual: %d", cost, actual)
}
}
}
_, err := Cost([]byte("$a$a$" + suffix))
if err == nil {
t.Errorf("Cost, malformed but no error returned")
}
}
func TestCostValidationInHash(t *testing.T) {
if testing.Short() {
return
}
pass := []byte("mypassword")
for c := 0; c < MinCost; c++ {
p, _ := newFromPassword(pass, c)
if p.cost != DefaultCost {
t.Errorf("newFromPassword should default costs below %d to %d, but was %d", MinCost, DefaultCost, p.cost)
}
}
p, _ := newFromPassword(pass, 14)
if p.cost != 14 {
t.Errorf("newFromPassword should default cost to 14, but was %d", p.cost)
}
hp, _ := newFromHash(p.Hash())
if p.cost != hp.cost {
t.Errorf("newFromHash should maintain the cost at %d, but was %d", p.cost, hp.cost)
}
_, err := newFromPassword(pass, 32)
if err == nil {
t.Fatalf("newFromPassword: should return a cost error")
}
if err != InvalidCostError(32) {
t.Errorf("newFromPassword: should return cost error, got %#v", err)
}
}
func TestCostReturnsWithLeadingZeroes(t *testing.T) {
hp, _ := newFromPassword([]byte("abcdefgh"), 7)
cost := hp.Hash()[4:7]
expected := []byte("07$")
if !bytes.Equal(expected, cost) {
t.Errorf("single digit costs in hash should have leading zeros: was %v instead of %v", cost, expected)
}
}
func TestMinorNotRequired(t *testing.T) {
noMinorHash := []byte("$2$10$XajjQvNhvvRt5GSeFk1xFeyqRrsxkhBkUiQeg0dt.wU1qD4aFDcga")
h, err := newFromHash(noMinorHash)
if err != nil {
t.Fatalf("No minor hash blew up: %s", err)
}
if h.minor != 0 {
t.Errorf("Should leave minor version at 0, but was %d", h.minor)
}
if !bytes.Equal(noMinorHash, h.Hash()) {
t.Errorf("Should generate hash %v, but created %v", noMinorHash, h.Hash())
}
}
func BenchmarkEqual(b *testing.B) {
b.StopTimer()
passwd := []byte("somepasswordyoulike")
hash, _ := GenerateFromPassword(passwd, 10)
b.StartTimer()
for i := 0; i < b.N; i++ {
CompareHashAndPassword(hash, passwd)
}
}
func BenchmarkGeneration(b *testing.B) {
b.StopTimer()
passwd := []byte("mylongpassword1234")
b.StartTimer()
for i := 0; i < b.N; i++ {
GenerateFromPassword(passwd, 10)
}
}

View File

@@ -0,0 +1,159 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blowfish
// getNextWord returns the next big-endian uint32 value from the byte slice
// at the given position in a circular manner, updating the position.
func getNextWord(b []byte, pos *int) uint32 {
var w uint32
j := *pos
for i := 0; i < 4; i++ {
w = w<<8 | uint32(b[j])
j++
if j >= len(b) {
j = 0
}
}
*pos = j
return w
}
// ExpandKey performs a key expansion on the given *Cipher. Specifically, it
// performs the Blowfish algorithm's key schedule which sets up the *Cipher's
// pi and substitution tables for calls to Encrypt. This is used, primarily,
// by the bcrypt package to reuse the Blowfish key schedule during its
// set up. It's unlikely that you need to use this directly.
func ExpandKey(key []byte, c *Cipher) {
j := 0
for i := 0; i < 18; i++ {
// Using inlined getNextWord for performance.
var d uint32
for k := 0; k < 4; k++ {
d = d<<8 | uint32(key[j])
j++
if j >= len(key) {
j = 0
}
}
c.p[i] ^= d
}
var l, r uint32
for i := 0; i < 18; i += 2 {
l, r = encryptBlock(l, r, c)
c.p[i], c.p[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l, r = encryptBlock(l, r, c)
c.s0[i], c.s0[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l, r = encryptBlock(l, r, c)
c.s1[i], c.s1[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l, r = encryptBlock(l, r, c)
c.s2[i], c.s2[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l, r = encryptBlock(l, r, c)
c.s3[i], c.s3[i+1] = l, r
}
}
// This is similar to ExpandKey, but folds the salt during the key
// schedule. While ExpandKey is essentially expandKeyWithSalt with an all-zero
// salt passed in, reusing ExpandKey turns out to be a place of inefficiency
// and specializing it here is useful.
func expandKeyWithSalt(key []byte, salt []byte, c *Cipher) {
j := 0
for i := 0; i < 18; i++ {
c.p[i] ^= getNextWord(key, &j)
}
j = 0
var l, r uint32
for i := 0; i < 18; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.p[i], c.p[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.s0[i], c.s0[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.s1[i], c.s1[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.s2[i], c.s2[i+1] = l, r
}
for i := 0; i < 256; i += 2 {
l ^= getNextWord(salt, &j)
r ^= getNextWord(salt, &j)
l, r = encryptBlock(l, r, c)
c.s3[i], c.s3[i+1] = l, r
}
}
func encryptBlock(l, r uint32, c *Cipher) (uint32, uint32) {
xl, xr := l, r
xl ^= c.p[0]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[1]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[2]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[3]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[4]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[5]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[6]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[7]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[8]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[9]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[10]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[11]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[12]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[13]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[14]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[15]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[16]
xr ^= c.p[17]
return xr, xl
}
func decryptBlock(l, r uint32, c *Cipher) (uint32, uint32) {
xl, xr := l, r
xl ^= c.p[17]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[16]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[15]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[14]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[13]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[12]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[11]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[10]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[9]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[8]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[7]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[6]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[5]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[4]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[3]
xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[2]
xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[1]
xr ^= c.p[0]
return xr, xl
}

View File

@@ -0,0 +1,274 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blowfish
import "testing"
type CryptTest struct {
key []byte
in []byte
out []byte
}
// Test vector values are from http://www.schneier.com/code/vectors.txt.
var encryptTests = []CryptTest{
{
[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
[]byte{0x4E, 0xF9, 0x97, 0x45, 0x61, 0x98, 0xDD, 0x78}},
{
[]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
[]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
[]byte{0x51, 0x86, 0x6F, 0xD5, 0xB8, 0x5E, 0xCB, 0x8A}},
{
[]byte{0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
[]byte{0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
[]byte{0x7D, 0x85, 0x6F, 0x9A, 0x61, 0x30, 0x63, 0xF2}},
{
[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
[]byte{0x24, 0x66, 0xDD, 0x87, 0x8B, 0x96, 0x3C, 0x9D}},
{
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
[]byte{0x61, 0xF9, 0xC3, 0x80, 0x22, 0x81, 0xB0, 0x96}},
{
[]byte{0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11},
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
[]byte{0x7D, 0x0C, 0xC6, 0x30, 0xAF, 0xDA, 0x1E, 0xC7}},
{
[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
[]byte{0x4E, 0xF9, 0x97, 0x45, 0x61, 0x98, 0xDD, 0x78}},
{
[]byte{0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10},
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
[]byte{0x0A, 0xCE, 0xAB, 0x0F, 0xC6, 0xA0, 0xA2, 0x8D}},
{
[]byte{0x7C, 0xA1, 0x10, 0x45, 0x4A, 0x1A, 0x6E, 0x57},
[]byte{0x01, 0xA1, 0xD6, 0xD0, 0x39, 0x77, 0x67, 0x42},
[]byte{0x59, 0xC6, 0x82, 0x45, 0xEB, 0x05, 0x28, 0x2B}},
{
[]byte{0x01, 0x31, 0xD9, 0x61, 0x9D, 0xC1, 0x37, 0x6E},
[]byte{0x5C, 0xD5, 0x4C, 0xA8, 0x3D, 0xEF, 0x57, 0xDA},
[]byte{0xB1, 0xB8, 0xCC, 0x0B, 0x25, 0x0F, 0x09, 0xA0}},
{
[]byte{0x07, 0xA1, 0x13, 0x3E, 0x4A, 0x0B, 0x26, 0x86},
[]byte{0x02, 0x48, 0xD4, 0x38, 0x06, 0xF6, 0x71, 0x72},
[]byte{0x17, 0x30, 0xE5, 0x77, 0x8B, 0xEA, 0x1D, 0xA4}},
{
[]byte{0x38, 0x49, 0x67, 0x4C, 0x26, 0x02, 0x31, 0x9E},
[]byte{0x51, 0x45, 0x4B, 0x58, 0x2D, 0xDF, 0x44, 0x0A},
[]byte{0xA2, 0x5E, 0x78, 0x56, 0xCF, 0x26, 0x51, 0xEB}},
{
[]byte{0x04, 0xB9, 0x15, 0xBA, 0x43, 0xFE, 0xB5, 0xB6},
[]byte{0x42, 0xFD, 0x44, 0x30, 0x59, 0x57, 0x7F, 0xA2},
[]byte{0x35, 0x38, 0x82, 0xB1, 0x09, 0xCE, 0x8F, 0x1A}},
{
[]byte{0x01, 0x13, 0xB9, 0x70, 0xFD, 0x34, 0xF2, 0xCE},
[]byte{0x05, 0x9B, 0x5E, 0x08, 0x51, 0xCF, 0x14, 0x3A},
[]byte{0x48, 0xF4, 0xD0, 0x88, 0x4C, 0x37, 0x99, 0x18}},
{
[]byte{0x01, 0x70, 0xF1, 0x75, 0x46, 0x8F, 0xB5, 0xE6},
[]byte{0x07, 0x56, 0xD8, 0xE0, 0x77, 0x47, 0x61, 0xD2},
[]byte{0x43, 0x21, 0x93, 0xB7, 0x89, 0x51, 0xFC, 0x98}},
{
[]byte{0x43, 0x29, 0x7F, 0xAD, 0x38, 0xE3, 0x73, 0xFE},
[]byte{0x76, 0x25, 0x14, 0xB8, 0x29, 0xBF, 0x48, 0x6A},
[]byte{0x13, 0xF0, 0x41, 0x54, 0xD6, 0x9D, 0x1A, 0xE5}},
{
[]byte{0x07, 0xA7, 0x13, 0x70, 0x45, 0xDA, 0x2A, 0x16},
[]byte{0x3B, 0xDD, 0x11, 0x90, 0x49, 0x37, 0x28, 0x02},
[]byte{0x2E, 0xED, 0xDA, 0x93, 0xFF, 0xD3, 0x9C, 0x79}},
{
[]byte{0x04, 0x68, 0x91, 0x04, 0xC2, 0xFD, 0x3B, 0x2F},
[]byte{0x26, 0x95, 0x5F, 0x68, 0x35, 0xAF, 0x60, 0x9A},
[]byte{0xD8, 0x87, 0xE0, 0x39, 0x3C, 0x2D, 0xA6, 0xE3}},
{
[]byte{0x37, 0xD0, 0x6B, 0xB5, 0x16, 0xCB, 0x75, 0x46},
[]byte{0x16, 0x4D, 0x5E, 0x40, 0x4F, 0x27, 0x52, 0x32},
[]byte{0x5F, 0x99, 0xD0, 0x4F, 0x5B, 0x16, 0x39, 0x69}},
{
[]byte{0x1F, 0x08, 0x26, 0x0D, 0x1A, 0xC2, 0x46, 0x5E},
[]byte{0x6B, 0x05, 0x6E, 0x18, 0x75, 0x9F, 0x5C, 0xCA},
[]byte{0x4A, 0x05, 0x7A, 0x3B, 0x24, 0xD3, 0x97, 0x7B}},
{
[]byte{0x58, 0x40, 0x23, 0x64, 0x1A, 0xBA, 0x61, 0x76},
[]byte{0x00, 0x4B, 0xD6, 0xEF, 0x09, 0x17, 0x60, 0x62},
[]byte{0x45, 0x20, 0x31, 0xC1, 0xE4, 0xFA, 0xDA, 0x8E}},
{
[]byte{0x02, 0x58, 0x16, 0x16, 0x46, 0x29, 0xB0, 0x07},
[]byte{0x48, 0x0D, 0x39, 0x00, 0x6E, 0xE7, 0x62, 0xF2},
[]byte{0x75, 0x55, 0xAE, 0x39, 0xF5, 0x9B, 0x87, 0xBD}},
{
[]byte{0x49, 0x79, 0x3E, 0xBC, 0x79, 0xB3, 0x25, 0x8F},
[]byte{0x43, 0x75, 0x40, 0xC8, 0x69, 0x8F, 0x3C, 0xFA},
[]byte{0x53, 0xC5, 0x5F, 0x9C, 0xB4, 0x9F, 0xC0, 0x19}},
{
[]byte{0x4F, 0xB0, 0x5E, 0x15, 0x15, 0xAB, 0x73, 0xA7},
[]byte{0x07, 0x2D, 0x43, 0xA0, 0x77, 0x07, 0x52, 0x92},
[]byte{0x7A, 0x8E, 0x7B, 0xFA, 0x93, 0x7E, 0x89, 0xA3}},
{
[]byte{0x49, 0xE9, 0x5D, 0x6D, 0x4C, 0xA2, 0x29, 0xBF},
[]byte{0x02, 0xFE, 0x55, 0x77, 0x81, 0x17, 0xF1, 0x2A},
[]byte{0xCF, 0x9C, 0x5D, 0x7A, 0x49, 0x86, 0xAD, 0xB5}},
{
[]byte{0x01, 0x83, 0x10, 0xDC, 0x40, 0x9B, 0x26, 0xD6},
[]byte{0x1D, 0x9D, 0x5C, 0x50, 0x18, 0xF7, 0x28, 0xC2},
[]byte{0xD1, 0xAB, 0xB2, 0x90, 0x65, 0x8B, 0xC7, 0x78}},
{
[]byte{0x1C, 0x58, 0x7F, 0x1C, 0x13, 0x92, 0x4F, 0xEF},
[]byte{0x30, 0x55, 0x32, 0x28, 0x6D, 0x6F, 0x29, 0x5A},
[]byte{0x55, 0xCB, 0x37, 0x74, 0xD1, 0x3E, 0xF2, 0x01}},
{
[]byte{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01},
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
[]byte{0xFA, 0x34, 0xEC, 0x48, 0x47, 0xB2, 0x68, 0xB2}},
{
[]byte{0x1F, 0x1F, 0x1F, 0x1F, 0x0E, 0x0E, 0x0E, 0x0E},
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
[]byte{0xA7, 0x90, 0x79, 0x51, 0x08, 0xEA, 0x3C, 0xAE}},
{
[]byte{0xE0, 0xFE, 0xE0, 0xFE, 0xF1, 0xFE, 0xF1, 0xFE},
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
[]byte{0xC3, 0x9E, 0x07, 0x2D, 0x9F, 0xAC, 0x63, 0x1D}},
{
[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
[]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
[]byte{0x01, 0x49, 0x33, 0xE0, 0xCD, 0xAF, 0xF6, 0xE4}},
{
[]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
[]byte{0xF2, 0x1E, 0x9A, 0x77, 0xB7, 0x1C, 0x49, 0xBC}},
{
[]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF},
[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
[]byte{0x24, 0x59, 0x46, 0x88, 0x57, 0x54, 0x36, 0x9A}},
{
[]byte{0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10},
[]byte{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
[]byte{0x6B, 0x5C, 0x5A, 0x9C, 0x5D, 0x9E, 0x0A, 0x5A}},
}
func TestCipherEncrypt(t *testing.T) {
for i, tt := range encryptTests {
c, err := NewCipher(tt.key)
if err != nil {
t.Errorf("NewCipher(%d bytes) = %s", len(tt.key), err)
continue
}
ct := make([]byte, len(tt.out))
c.Encrypt(ct, tt.in)
for j, v := range ct {
if v != tt.out[j] {
t.Errorf("Cipher.Encrypt, test vector #%d: cipher-text[%d] = %#x, expected %#x", i, j, v, tt.out[j])
break
}
}
}
}
func TestCipherDecrypt(t *testing.T) {
for i, tt := range encryptTests {
c, err := NewCipher(tt.key)
if err != nil {
t.Errorf("NewCipher(%d bytes) = %s", len(tt.key), err)
continue
}
pt := make([]byte, len(tt.in))
c.Decrypt(pt, tt.out)
for j, v := range pt {
if v != tt.in[j] {
t.Errorf("Cipher.Decrypt, test vector #%d: plain-text[%d] = %#x, expected %#x", i, j, v, tt.in[j])
break
}
}
}
}
func TestSaltedCipherKeyLength(t *testing.T) {
if _, err := NewSaltedCipher(nil, []byte{'a'}); err != KeySizeError(0) {
t.Errorf("NewSaltedCipher with short key, gave error %#v, expected %#v", err, KeySizeError(0))
}
// A 57-byte key. One over the typical blowfish restriction.
key := []byte("012345678901234567890123456789012345678901234567890123456")
if _, err := NewSaltedCipher(key, []byte{'a'}); err != nil {
t.Errorf("NewSaltedCipher with long key, gave error %#v", err)
}
}
// Test vectors generated with Blowfish from OpenSSH.
var saltedVectors = [][8]byte{
{0x0c, 0x82, 0x3b, 0x7b, 0x8d, 0x01, 0x4b, 0x7e},
{0xd1, 0xe1, 0x93, 0xf0, 0x70, 0xa6, 0xdb, 0x12},
{0xfc, 0x5e, 0xba, 0xde, 0xcb, 0xf8, 0x59, 0xad},
{0x8a, 0x0c, 0x76, 0xe7, 0xdd, 0x2c, 0xd3, 0xa8},
{0x2c, 0xcb, 0x7b, 0xee, 0xac, 0x7b, 0x7f, 0xf8},
{0xbb, 0xf6, 0x30, 0x6f, 0xe1, 0x5d, 0x62, 0xbf},
{0x97, 0x1e, 0xc1, 0x3d, 0x3d, 0xe0, 0x11, 0xe9},
{0x06, 0xd7, 0x4d, 0xb1, 0x80, 0xa3, 0xb1, 0x38},
{0x67, 0xa1, 0xa9, 0x75, 0x0e, 0x5b, 0xc6, 0xb4},
{0x51, 0x0f, 0x33, 0x0e, 0x4f, 0x67, 0xd2, 0x0c},
{0xf1, 0x73, 0x7e, 0xd8, 0x44, 0xea, 0xdb, 0xe5},
{0x14, 0x0e, 0x16, 0xce, 0x7f, 0x4a, 0x9c, 0x7b},
{0x4b, 0xfe, 0x43, 0xfd, 0xbf, 0x36, 0x04, 0x47},
{0xb1, 0xeb, 0x3e, 0x15, 0x36, 0xa7, 0xbb, 0xe2},
{0x6d, 0x0b, 0x41, 0xdd, 0x00, 0x98, 0x0b, 0x19},
{0xd3, 0xce, 0x45, 0xce, 0x1d, 0x56, 0xb7, 0xfc},
{0xd9, 0xf0, 0xfd, 0xda, 0xc0, 0x23, 0xb7, 0x93},
{0x4c, 0x6f, 0xa1, 0xe4, 0x0c, 0xa8, 0xca, 0x57},
{0xe6, 0x2f, 0x28, 0xa7, 0x0c, 0x94, 0x0d, 0x08},
{0x8f, 0xe3, 0xf0, 0xb6, 0x29, 0xe3, 0x44, 0x03},
{0xff, 0x98, 0xdd, 0x04, 0x45, 0xb4, 0x6d, 0x1f},
{0x9e, 0x45, 0x4d, 0x18, 0x40, 0x53, 0xdb, 0xef},
{0xb7, 0x3b, 0xef, 0x29, 0xbe, 0xa8, 0x13, 0x71},
{0x02, 0x54, 0x55, 0x41, 0x8e, 0x04, 0xfc, 0xad},
{0x6a, 0x0a, 0xee, 0x7c, 0x10, 0xd9, 0x19, 0xfe},
{0x0a, 0x22, 0xd9, 0x41, 0xcc, 0x23, 0x87, 0x13},
{0x6e, 0xff, 0x1f, 0xff, 0x36, 0x17, 0x9c, 0xbe},
{0x79, 0xad, 0xb7, 0x40, 0xf4, 0x9f, 0x51, 0xa6},
{0x97, 0x81, 0x99, 0xa4, 0xde, 0x9e, 0x9f, 0xb6},
{0x12, 0x19, 0x7a, 0x28, 0xd0, 0xdc, 0xcc, 0x92},
{0x81, 0xda, 0x60, 0x1e, 0x0e, 0xdd, 0x65, 0x56},
{0x7d, 0x76, 0x20, 0xb2, 0x73, 0xc9, 0x9e, 0xee},
}
func TestSaltedCipher(t *testing.T) {
var key, salt [32]byte
for i := range key {
key[i] = byte(i)
salt[i] = byte(i + 32)
}
for i, v := range saltedVectors {
c, err := NewSaltedCipher(key[:], salt[:i])
if err != nil {
t.Fatal(err)
}
var buf [8]byte
c.Encrypt(buf[:], buf[:])
if v != buf {
t.Errorf("%d: expected %x, got %x", i, v, buf)
}
}
}
func BenchmarkExpandKeyWithSalt(b *testing.B) {
key := make([]byte, 32)
salt := make([]byte, 16)
c, _ := NewCipher(key)
for i := 0; i < b.N; i++ {
expandKeyWithSalt(key, salt, c)
}
}
func BenchmarkExpandKey(b *testing.B) {
key := make([]byte, 32)
c, _ := NewCipher(key)
for i := 0; i < b.N; i++ {
ExpandKey(key, c)
}
}

View File

@@ -0,0 +1,91 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package blowfish implements Bruce Schneier's Blowfish encryption algorithm.
package blowfish
// The code is a port of Bruce Schneier's C implementation.
// See http://www.schneier.com/blowfish.html.
import "strconv"
// The Blowfish block size in bytes.
const BlockSize = 8
// A Cipher is an instance of Blowfish encryption using a particular key.
type Cipher struct {
p [18]uint32
s0, s1, s2, s3 [256]uint32
}
type KeySizeError int
func (k KeySizeError) Error() string {
return "crypto/blowfish: invalid key size " + strconv.Itoa(int(k))
}
// NewCipher creates and returns a Cipher.
// The key argument should be the Blowfish key, from 1 to 56 bytes.
func NewCipher(key []byte) (*Cipher, error) {
var result Cipher
if k := len(key); k < 1 || k > 56 {
return nil, KeySizeError(k)
}
initCipher(&result)
ExpandKey(key, &result)
return &result, nil
}
// NewSaltedCipher creates a returns a Cipher that folds a salt into its key
// schedule. For most purposes, NewCipher, instead of NewSaltedCipher, is
// sufficient and desirable. For bcrypt compatiblity, the key can be over 56
// bytes.
func NewSaltedCipher(key, salt []byte) (*Cipher, error) {
if len(salt) == 0 {
return NewCipher(key)
}
var result Cipher
if k := len(key); k < 1 {
return nil, KeySizeError(k)
}
initCipher(&result)
expandKeyWithSalt(key, salt, &result)
return &result, nil
}
// BlockSize returns the Blowfish block size, 8 bytes.
// It is necessary to satisfy the Block interface in the
// package "crypto/cipher".
func (c *Cipher) BlockSize() int { return BlockSize }
// Encrypt encrypts the 8-byte buffer src using the key k
// and stores the result in dst.
// Note that for amounts of data larger than a block,
// it is not safe to just call Encrypt on successive blocks;
// instead, use an encryption mode like CBC (see crypto/cipher/cbc.go).
func (c *Cipher) Encrypt(dst, src []byte) {
l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
l, r = encryptBlock(l, r, c)
dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l)
dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r)
}
// Decrypt decrypts the 8-byte buffer src using the key k
// and stores the result in dst.
func (c *Cipher) Decrypt(dst, src []byte) {
l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
l, r = decryptBlock(l, r, c)
dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l)
dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r)
}
func initCipher(c *Cipher) {
copy(c.p[0:], p[0:])
copy(c.s0[0:], s0[0:])
copy(c.s1[0:], s1[0:])
copy(c.s2[0:], s2[0:])
copy(c.s3[0:], s3[0:])
}

View File

@@ -0,0 +1,199 @@
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The startup permutation array and substitution boxes.
// They are the hexadecimal digits of PI; see:
// http://www.schneier.com/code/constants.txt.
package blowfish
var s0 = [256]uint32{
0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7, 0xb8e1afed, 0x6a267e96,
0xba7c9045, 0xf12c7f99, 0x24a19947, 0xb3916cf7, 0x0801f2e2, 0x858efc16,
0x636920d8, 0x71574e69, 0xa458fea3, 0xf4933d7e, 0x0d95748f, 0x728eb658,
0x718bcd58, 0x82154aee, 0x7b54a41d, 0xc25a59b5, 0x9c30d539, 0x2af26013,
0xc5d1b023, 0x286085f0, 0xca417918, 0xb8db38ef, 0x8e79dcb0, 0x603a180e,
0x6c9e0e8b, 0xb01e8a3e, 0xd71577c1, 0xbd314b27, 0x78af2fda, 0x55605c60,
0xe65525f3, 0xaa55ab94, 0x57489862, 0x63e81440, 0x55ca396a, 0x2aab10b6,
0xb4cc5c34, 0x1141e8ce, 0xa15486af, 0x7c72e993, 0xb3ee1411, 0x636fbc2a,
0x2ba9c55d, 0x741831f6, 0xce5c3e16, 0x9b87931e, 0xafd6ba33, 0x6c24cf5c,
0x7a325381, 0x28958677, 0x3b8f4898, 0x6b4bb9af, 0xc4bfe81b, 0x66282193,
0x61d809cc, 0xfb21a991, 0x487cac60, 0x5dec8032, 0xef845d5d, 0xe98575b1,
0xdc262302, 0xeb651b88, 0x23893e81, 0xd396acc5, 0x0f6d6ff3, 0x83f44239,
0x2e0b4482, 0xa4842004, 0x69c8f04a, 0x9e1f9b5e, 0x21c66842, 0xf6e96c9a,
0x670c9c61, 0xabd388f0, 0x6a51a0d2, 0xd8542f68, 0x960fa728, 0xab5133a3,
0x6eef0b6c, 0x137a3be4, 0xba3bf050, 0x7efb2a98, 0xa1f1651d, 0x39af0176,
0x66ca593e, 0x82430e88, 0x8cee8619, 0x456f9fb4, 0x7d84a5c3, 0x3b8b5ebe,
0xe06f75d8, 0x85c12073, 0x401a449f, 0x56c16aa6, 0x4ed3aa62, 0x363f7706,
0x1bfedf72, 0x429b023d, 0x37d0d724, 0xd00a1248, 0xdb0fead3, 0x49f1c09b,
0x075372c9, 0x80991b7b, 0x25d479d8, 0xf6e8def7, 0xe3fe501a, 0xb6794c3b,
0x976ce0bd, 0x04c006ba, 0xc1a94fb6, 0x409f60c4, 0x5e5c9ec2, 0x196a2463,
0x68fb6faf, 0x3e6c53b5, 0x1339b2eb, 0x3b52ec6f, 0x6dfc511f, 0x9b30952c,
0xcc814544, 0xaf5ebd09, 0xbee3d004, 0xde334afd, 0x660f2807, 0x192e4bb3,
0xc0cba857, 0x45c8740f, 0xd20b5f39, 0xb9d3fbdb, 0x5579c0bd, 0x1a60320a,
0xd6a100c6, 0x402c7279, 0x679f25fe, 0xfb1fa3cc, 0x8ea5e9f8, 0xdb3222f8,
0x3c7516df, 0xfd616b15, 0x2f501ec8, 0xad0552ab, 0x323db5fa, 0xfd238760,
0x53317b48, 0x3e00df82, 0x9e5c57bb, 0xca6f8ca0, 0x1a87562e, 0xdf1769db,
0xd542a8f6, 0x287effc3, 0xac6732c6, 0x8c4f5573, 0x695b27b0, 0xbbca58c8,
0xe1ffa35d, 0xb8f011a0, 0x10fa3d98, 0xfd2183b8, 0x4afcb56c, 0x2dd1d35b,
0x9a53e479, 0xb6f84565, 0xd28e49bc, 0x4bfb9790, 0xe1ddf2da, 0xa4cb7e33,
0x62fb1341, 0xcee4c6e8, 0xef20cada, 0x36774c01, 0xd07e9efe, 0x2bf11fb4,
0x95dbda4d, 0xae909198, 0xeaad8e71, 0x6b93d5a0, 0xd08ed1d0, 0xafc725e0,
0x8e3c5b2f, 0x8e7594b7, 0x8ff6e2fb, 0xf2122b64, 0x8888b812, 0x900df01c,
0x4fad5ea0, 0x688fc31c, 0xd1cff191, 0xb3a8c1ad, 0x2f2f2218, 0xbe0e1777,
0xea752dfe, 0x8b021fa1, 0xe5a0cc0f, 0xb56f74e8, 0x18acf3d6, 0xce89e299,
0xb4a84fe0, 0xfd13e0b7, 0x7cc43b81, 0xd2ada8d9, 0x165fa266, 0x80957705,
0x93cc7314, 0x211a1477, 0xe6ad2065, 0x77b5fa86, 0xc75442f5, 0xfb9d35cf,
0xebcdaf0c, 0x7b3e89a0, 0xd6411bd3, 0xae1e7e49, 0x00250e2d, 0x2071b35e,
0x226800bb, 0x57b8e0af, 0x2464369b, 0xf009b91e, 0x5563911d, 0x59dfa6aa,
0x78c14389, 0xd95a537f, 0x207d5ba2, 0x02e5b9c5, 0x83260376, 0x6295cfa9,
0x11c81968, 0x4e734a41, 0xb3472dca, 0x7b14a94a, 0x1b510052, 0x9a532915,
0xd60f573f, 0xbc9bc6e4, 0x2b60a476, 0x81e67400, 0x08ba6fb5, 0x571be91f,
0xf296ec6b, 0x2a0dd915, 0xb6636521, 0xe7b9f9b6, 0xff34052e, 0xc5855664,
0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a,
}
var s1 = [256]uint32{
0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623, 0xad6ea6b0, 0x49a7df7d,
0x9cee60b8, 0x8fedb266, 0xecaa8c71, 0x699a17ff, 0x5664526c, 0xc2b19ee1,
0x193602a5, 0x75094c29, 0xa0591340, 0xe4183a3e, 0x3f54989a, 0x5b429d65,
0x6b8fe4d6, 0x99f73fd6, 0xa1d29c07, 0xefe830f5, 0x4d2d38e6, 0xf0255dc1,
0x4cdd2086, 0x8470eb26, 0x6382e9c6, 0x021ecc5e, 0x09686b3f, 0x3ebaefc9,
0x3c971814, 0x6b6a70a1, 0x687f3584, 0x52a0e286, 0xb79c5305, 0xaa500737,
0x3e07841c, 0x7fdeae5c, 0x8e7d44ec, 0x5716f2b8, 0xb03ada37, 0xf0500c0d,
0xf01c1f04, 0x0200b3ff, 0xae0cf51a, 0x3cb574b2, 0x25837a58, 0xdc0921bd,
0xd19113f9, 0x7ca92ff6, 0x94324773, 0x22f54701, 0x3ae5e581, 0x37c2dadc,
0xc8b57634, 0x9af3dda7, 0xa9446146, 0x0fd0030e, 0xecc8c73e, 0xa4751e41,
0xe238cd99, 0x3bea0e2f, 0x3280bba1, 0x183eb331, 0x4e548b38, 0x4f6db908,
0x6f420d03, 0xf60a04bf, 0x2cb81290, 0x24977c79, 0x5679b072, 0xbcaf89af,
0xde9a771f, 0xd9930810, 0xb38bae12, 0xdccf3f2e, 0x5512721f, 0x2e6b7124,
0x501adde6, 0x9f84cd87, 0x7a584718, 0x7408da17, 0xbc9f9abc, 0xe94b7d8c,
0xec7aec3a, 0xdb851dfa, 0x63094366, 0xc464c3d2, 0xef1c1847, 0x3215d908,
0xdd433b37, 0x24c2ba16, 0x12a14d43, 0x2a65c451, 0x50940002, 0x133ae4dd,
0x71dff89e, 0x10314e55, 0x81ac77d6, 0x5f11199b, 0x043556f1, 0xd7a3c76b,
0x3c11183b, 0x5924a509, 0xf28fe6ed, 0x97f1fbfa, 0x9ebabf2c, 0x1e153c6e,
0x86e34570, 0xeae96fb1, 0x860e5e0a, 0x5a3e2ab3, 0x771fe71c, 0x4e3d06fa,
0x2965dcb9, 0x99e71d0f, 0x803e89d6, 0x5266c825, 0x2e4cc978, 0x9c10b36a,
0xc6150eba, 0x94e2ea78, 0xa5fc3c53, 0x1e0a2df4, 0xf2f74ea7, 0x361d2b3d,
0x1939260f, 0x19c27960, 0x5223a708, 0xf71312b6, 0xebadfe6e, 0xeac31f66,
0xe3bc4595, 0xa67bc883, 0xb17f37d1, 0x018cff28, 0xc332ddef, 0xbe6c5aa5,
0x65582185, 0x68ab9802, 0xeecea50f, 0xdb2f953b, 0x2aef7dad, 0x5b6e2f84,
0x1521b628, 0x29076170, 0xecdd4775, 0x619f1510, 0x13cca830, 0xeb61bd96,
0x0334fe1e, 0xaa0363cf, 0xb5735c90, 0x4c70a239, 0xd59e9e0b, 0xcbaade14,
0xeecc86bc, 0x60622ca7, 0x9cab5cab, 0xb2f3846e, 0x648b1eaf, 0x19bdf0ca,
0xa02369b9, 0x655abb50, 0x40685a32, 0x3c2ab4b3, 0x319ee9d5, 0xc021b8f7,
0x9b540b19, 0x875fa099, 0x95f7997e, 0x623d7da8, 0xf837889a, 0x97e32d77,
0x11ed935f, 0x16681281, 0x0e358829, 0xc7e61fd6, 0x96dedfa1, 0x7858ba99,
0x57f584a5, 0x1b227263, 0x9b83c3ff, 0x1ac24696, 0xcdb30aeb, 0x532e3054,
0x8fd948e4, 0x6dbc3128, 0x58ebf2ef, 0x34c6ffea, 0xfe28ed61, 0xee7c3c73,
0x5d4a14d9, 0xe864b7e3, 0x42105d14, 0x203e13e0, 0x45eee2b6, 0xa3aaabea,
0xdb6c4f15, 0xfacb4fd0, 0xc742f442, 0xef6abbb5, 0x654f3b1d, 0x41cd2105,
0xd81e799e, 0x86854dc7, 0xe44b476a, 0x3d816250, 0xcf62a1f2, 0x5b8d2646,
0xfc8883a0, 0xc1c7b6a3, 0x7f1524c3, 0x69cb7492, 0x47848a0b, 0x5692b285,
0x095bbf00, 0xad19489d, 0x1462b174, 0x23820e00, 0x58428d2a, 0x0c55f5ea,
0x1dadf43e, 0x233f7061, 0x3372f092, 0x8d937e41, 0xd65fecf1, 0x6c223bdb,
0x7cde3759, 0xcbee7460, 0x4085f2a7, 0xce77326e, 0xa6078084, 0x19f8509e,
0xe8efd855, 0x61d99735, 0xa969a7aa, 0xc50c06c2, 0x5a04abfc, 0x800bcadc,
0x9e447a2e, 0xc3453484, 0xfdd56705, 0x0e1e9ec9, 0xdb73dbd3, 0x105588cd,
0x675fda79, 0xe3674340, 0xc5c43465, 0x713e38d8, 0x3d28f89e, 0xf16dff20,
0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7,
}
var s2 = [256]uint32{
0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934, 0x411520f7, 0x7602d4f7,
0xbcf46b2e, 0xd4a20068, 0xd4082471, 0x3320f46a, 0x43b7d4b7, 0x500061af,
0x1e39f62e, 0x97244546, 0x14214f74, 0xbf8b8840, 0x4d95fc1d, 0x96b591af,
0x70f4ddd3, 0x66a02f45, 0xbfbc09ec, 0x03bd9785, 0x7fac6dd0, 0x31cb8504,
0x96eb27b3, 0x55fd3941, 0xda2547e6, 0xabca0a9a, 0x28507825, 0x530429f4,
0x0a2c86da, 0xe9b66dfb, 0x68dc1462, 0xd7486900, 0x680ec0a4, 0x27a18dee,
0x4f3ffea2, 0xe887ad8c, 0xb58ce006, 0x7af4d6b6, 0xaace1e7c, 0xd3375fec,
0xce78a399, 0x406b2a42, 0x20fe9e35, 0xd9f385b9, 0xee39d7ab, 0x3b124e8b,
0x1dc9faf7, 0x4b6d1856, 0x26a36631, 0xeae397b2, 0x3a6efa74, 0xdd5b4332,
0x6841e7f7, 0xca7820fb, 0xfb0af54e, 0xd8feb397, 0x454056ac, 0xba489527,
0x55533a3a, 0x20838d87, 0xfe6ba9b7, 0xd096954b, 0x55a867bc, 0xa1159a58,
0xcca92963, 0x99e1db33, 0xa62a4a56, 0x3f3125f9, 0x5ef47e1c, 0x9029317c,
0xfdf8e802, 0x04272f70, 0x80bb155c, 0x05282ce3, 0x95c11548, 0xe4c66d22,
0x48c1133f, 0xc70f86dc, 0x07f9c9ee, 0x41041f0f, 0x404779a4, 0x5d886e17,
0x325f51eb, 0xd59bc0d1, 0xf2bcc18f, 0x41113564, 0x257b7834, 0x602a9c60,
0xdff8e8a3, 0x1f636c1b, 0x0e12b4c2, 0x02e1329e, 0xaf664fd1, 0xcad18115,
0x6b2395e0, 0x333e92e1, 0x3b240b62, 0xeebeb922, 0x85b2a20e, 0xe6ba0d99,
0xde720c8c, 0x2da2f728, 0xd0127845, 0x95b794fd, 0x647d0862, 0xe7ccf5f0,
0x5449a36f, 0x877d48fa, 0xc39dfd27, 0xf33e8d1e, 0x0a476341, 0x992eff74,
0x3a6f6eab, 0xf4f8fd37, 0xa812dc60, 0xa1ebddf8, 0x991be14c, 0xdb6e6b0d,
0xc67b5510, 0x6d672c37, 0x2765d43b, 0xdcd0e804, 0xf1290dc7, 0xcc00ffa3,
0xb5390f92, 0x690fed0b, 0x667b9ffb, 0xcedb7d9c, 0xa091cf0b, 0xd9155ea3,
0xbb132f88, 0x515bad24, 0x7b9479bf, 0x763bd6eb, 0x37392eb3, 0xcc115979,
0x8026e297, 0xf42e312d, 0x6842ada7, 0xc66a2b3b, 0x12754ccc, 0x782ef11c,
0x6a124237, 0xb79251e7, 0x06a1bbe6, 0x4bfb6350, 0x1a6b1018, 0x11caedfa,
0x3d25bdd8, 0xe2e1c3c9, 0x44421659, 0x0a121386, 0xd90cec6e, 0xd5abea2a,
0x64af674e, 0xda86a85f, 0xbebfe988, 0x64e4c3fe, 0x9dbc8057, 0xf0f7c086,
0x60787bf8, 0x6003604d, 0xd1fd8346, 0xf6381fb0, 0x7745ae04, 0xd736fccc,
0x83426b33, 0xf01eab71, 0xb0804187, 0x3c005e5f, 0x77a057be, 0xbde8ae24,
0x55464299, 0xbf582e61, 0x4e58f48f, 0xf2ddfda2, 0xf474ef38, 0x8789bdc2,
0x5366f9c3, 0xc8b38e74, 0xb475f255, 0x46fcd9b9, 0x7aeb2661, 0x8b1ddf84,
0x846a0e79, 0x915f95e2, 0x466e598e, 0x20b45770, 0x8cd55591, 0xc902de4c,
0xb90bace1, 0xbb8205d0, 0x11a86248, 0x7574a99e, 0xb77f19b6, 0xe0a9dc09,
0x662d09a1, 0xc4324633, 0xe85a1f02, 0x09f0be8c, 0x4a99a025, 0x1d6efe10,
0x1ab93d1d, 0x0ba5a4df, 0xa186f20f, 0x2868f169, 0xdcb7da83, 0x573906fe,
0xa1e2ce9b, 0x4fcd7f52, 0x50115e01, 0xa70683fa, 0xa002b5c4, 0x0de6d027,
0x9af88c27, 0x773f8641, 0xc3604c06, 0x61a806b5, 0xf0177a28, 0xc0f586e0,
0x006058aa, 0x30dc7d62, 0x11e69ed7, 0x2338ea63, 0x53c2dd94, 0xc2c21634,
0xbbcbee56, 0x90bcb6de, 0xebfc7da1, 0xce591d76, 0x6f05e409, 0x4b7c0188,
0x39720a3d, 0x7c927c24, 0x86e3725f, 0x724d9db9, 0x1ac15bb4, 0xd39eb8fc,
0xed545578, 0x08fca5b5, 0xd83d7cd3, 0x4dad0fc4, 0x1e50ef5e, 0xb161e6f8,
0xa28514d9, 0x6c51133c, 0x6fd5c7e7, 0x56e14ec4, 0x362abfce, 0xddc6c837,
0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0,
}
var s3 = [256]uint32{
0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b, 0x5cb0679e, 0x4fa33742,
0xd3822740, 0x99bc9bbe, 0xd5118e9d, 0xbf0f7315, 0xd62d1c7e, 0xc700c47b,
0xb78c1b6b, 0x21a19045, 0xb26eb1be, 0x6a366eb4, 0x5748ab2f, 0xbc946e79,
0xc6a376d2, 0x6549c2c8, 0x530ff8ee, 0x468dde7d, 0xd5730a1d, 0x4cd04dc6,
0x2939bbdb, 0xa9ba4650, 0xac9526e8, 0xbe5ee304, 0xa1fad5f0, 0x6a2d519a,
0x63ef8ce2, 0x9a86ee22, 0xc089c2b8, 0x43242ef6, 0xa51e03aa, 0x9cf2d0a4,
0x83c061ba, 0x9be96a4d, 0x8fe51550, 0xba645bd6, 0x2826a2f9, 0xa73a3ae1,
0x4ba99586, 0xef5562e9, 0xc72fefd3, 0xf752f7da, 0x3f046f69, 0x77fa0a59,
0x80e4a915, 0x87b08601, 0x9b09e6ad, 0x3b3ee593, 0xe990fd5a, 0x9e34d797,
0x2cf0b7d9, 0x022b8b51, 0x96d5ac3a, 0x017da67d, 0xd1cf3ed6, 0x7c7d2d28,
0x1f9f25cf, 0xadf2b89b, 0x5ad6b472, 0x5a88f54c, 0xe029ac71, 0xe019a5e6,
0x47b0acfd, 0xed93fa9b, 0xe8d3c48d, 0x283b57cc, 0xf8d56629, 0x79132e28,
0x785f0191, 0xed756055, 0xf7960e44, 0xe3d35e8c, 0x15056dd4, 0x88f46dba,
0x03a16125, 0x0564f0bd, 0xc3eb9e15, 0x3c9057a2, 0x97271aec, 0xa93a072a,
0x1b3f6d9b, 0x1e6321f5, 0xf59c66fb, 0x26dcf319, 0x7533d928, 0xb155fdf5,
0x03563482, 0x8aba3cbb, 0x28517711, 0xc20ad9f8, 0xabcc5167, 0xccad925f,
0x4de81751, 0x3830dc8e, 0x379d5862, 0x9320f991, 0xea7a90c2, 0xfb3e7bce,
0x5121ce64, 0x774fbe32, 0xa8b6e37e, 0xc3293d46, 0x48de5369, 0x6413e680,
0xa2ae0810, 0xdd6db224, 0x69852dfd, 0x09072166, 0xb39a460a, 0x6445c0dd,
0x586cdecf, 0x1c20c8ae, 0x5bbef7dd, 0x1b588d40, 0xccd2017f, 0x6bb4e3bb,
0xdda26a7e, 0x3a59ff45, 0x3e350a44, 0xbcb4cdd5, 0x72eacea8, 0xfa6484bb,
0x8d6612ae, 0xbf3c6f47, 0xd29be463, 0x542f5d9e, 0xaec2771b, 0xf64e6370,
0x740e0d8d, 0xe75b1357, 0xf8721671, 0xaf537d5d, 0x4040cb08, 0x4eb4e2cc,
0x34d2466a, 0x0115af84, 0xe1b00428, 0x95983a1d, 0x06b89fb4, 0xce6ea048,
0x6f3f3b82, 0x3520ab82, 0x011a1d4b, 0x277227f8, 0x611560b1, 0xe7933fdc,
0xbb3a792b, 0x344525bd, 0xa08839e1, 0x51ce794b, 0x2f32c9b7, 0xa01fbac9,
0xe01cc87e, 0xbcc7d1f6, 0xcf0111c3, 0xa1e8aac7, 0x1a908749, 0xd44fbd9a,
0xd0dadecb, 0xd50ada38, 0x0339c32a, 0xc6913667, 0x8df9317c, 0xe0b12b4f,
0xf79e59b7, 0x43f5bb3a, 0xf2d519ff, 0x27d9459c, 0xbf97222c, 0x15e6fc2a,
0x0f91fc71, 0x9b941525, 0xfae59361, 0xceb69ceb, 0xc2a86459, 0x12baa8d1,
0xb6c1075e, 0xe3056a0c, 0x10d25065, 0xcb03a442, 0xe0ec6e0e, 0x1698db3b,
0x4c98a0be, 0x3278e964, 0x9f1f9532, 0xe0d392df, 0xd3a0342b, 0x8971f21e,
0x1b0a7441, 0x4ba3348c, 0xc5be7120, 0xc37632d8, 0xdf359f8d, 0x9b992f2e,
0xe60b6f47, 0x0fe3f11d, 0xe54cda54, 0x1edad891, 0xce6279cf, 0xcd3e7e6f,
0x1618b166, 0xfd2c1d05, 0x848fd2c5, 0xf6fb2299, 0xf523f357, 0xa6327623,
0x93a83531, 0x56cccd02, 0xacf08162, 0x5a75ebb5, 0x6e163697, 0x88d273cc,
0xde966292, 0x81b949d0, 0x4c50901b, 0x71c65614, 0xe6c6c7bd, 0x327a140a,
0x45e1d006, 0xc3f27b9a, 0xc9aa53fd, 0x62a80f00, 0xbb25bfe2, 0x35bdd2f6,
0x71126905, 0xb2040222, 0xb6cbcf7c, 0xcd769c2b, 0x53113ec0, 0x1640e3d3,
0x38abbd60, 0x2547adf0, 0xba38209c, 0xf746ce76, 0x77afa1c5, 0x20756060,
0x85cbfe4e, 0x8ae88dd8, 0x7aaaf9b0, 0x4cf9aa7e, 0x1948c25c, 0x02fb8a8c,
0x01c36ae4, 0xd6ebe1f9, 0x90d4f869, 0xa65cdea0, 0x3f09252d, 0xc208e69f,
0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6,
}
var p = [18]uint32{
0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, 0xa4093822, 0x299f31d0,
0x082efa98, 0xec4e6c89, 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917, 0x9216d5d9, 0x8979fb1b,
}

View File

@@ -0,0 +1,37 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package transform_test
import (
"fmt"
"unicode"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
)
func ExampleRemoveFunc() {
input := []byte(`tschüß; до свидания`)
b := make([]byte, len(input))
t := transform.RemoveFunc(unicode.IsSpace)
n, _, _ := t.Transform(b, input, true)
fmt.Println(string(b[:n]))
t = transform.RemoveFunc(func(r rune) bool {
return !unicode.Is(unicode.Latin, r)
})
n, _, _ = t.Transform(b, input, true)
fmt.Println(string(b[:n]))
n, _, _ = t.Transform(b, norm.NFD.Bytes(input), true)
fmt.Println(string(b[:n]))
// Output:
// tschüß;досвидания
// tschüß
// tschuß
}

View File

@@ -0,0 +1,616 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package transform provides reader and writer wrappers that transform the
// bytes passing through as well as various transformations. Example
// transformations provided by other packages include normalization and
// conversion between character sets.
package transform
import (
"bytes"
"errors"
"io"
"unicode/utf8"
)
var (
// ErrShortDst means that the destination buffer was too short to
// receive all of the transformed bytes.
ErrShortDst = errors.New("transform: short destination buffer")
// ErrShortSrc means that the source buffer has insufficient data to
// complete the transformation.
ErrShortSrc = errors.New("transform: short source buffer")
// errInconsistentByteCount means that Transform returned success (nil
// error) but also returned nSrc inconsistent with the src argument.
errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
// errShortInternal means that an internal buffer is not large enough
// to make progress and the Transform operation must be aborted.
errShortInternal = errors.New("transform: short internal buffer")
)
// Transformer transforms bytes.
type Transformer interface {
// Transform writes to dst the transformed bytes read from src, and
// returns the number of dst bytes written and src bytes read. The
// atEOF argument tells whether src represents the last bytes of the
// input.
//
// Callers should always process the nDst bytes produced and account
// for the nSrc bytes consumed before considering the error err.
//
// A nil error means that all of the transformed bytes (whether freshly
// transformed from src or left over from previous Transform calls)
// were written to dst. A nil error can be returned regardless of
// whether atEOF is true. If err is nil then nSrc must equal len(src);
// the converse is not necessarily true.
//
// ErrShortDst means that dst was too short to receive all of the
// transformed bytes. ErrShortSrc means that src had insufficient data
// to complete the transformation. If both conditions apply, then
// either error may be returned. Other than the error conditions listed
// here, implementations are free to report other errors that arise.
Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
// Reset resets the state and allows a Transformer to be reused.
Reset()
}
// NopResetter can be embedded by implementations of Transformer to add a nop
// Reset method.
type NopResetter struct{}
// Reset implements the Reset method of the Transformer interface.
func (NopResetter) Reset() {}
// Reader wraps another io.Reader by transforming the bytes read.
type Reader struct {
r io.Reader
t Transformer
err error
// dst[dst0:dst1] contains bytes that have been transformed by t but
// not yet copied out via Read.
dst []byte
dst0, dst1 int
// src[src0:src1] contains bytes that have been read from r but not
// yet transformed through t.
src []byte
src0, src1 int
// transformComplete is whether the transformation is complete,
// regardless of whether or not it was successful.
transformComplete bool
}
const defaultBufSize = 4096
// NewReader returns a new Reader that wraps r by transforming the bytes read
// via t. It calls Reset on t.
func NewReader(r io.Reader, t Transformer) *Reader {
t.Reset()
return &Reader{
r: r,
t: t,
dst: make([]byte, defaultBufSize),
src: make([]byte, defaultBufSize),
}
}
// Read implements the io.Reader interface.
func (r *Reader) Read(p []byte) (int, error) {
n, err := 0, error(nil)
for {
// Copy out any transformed bytes and return the final error if we are done.
if r.dst0 != r.dst1 {
n = copy(p, r.dst[r.dst0:r.dst1])
r.dst0 += n
if r.dst0 == r.dst1 && r.transformComplete {
return n, r.err
}
return n, nil
} else if r.transformComplete {
return 0, r.err
}
// Try to transform some source bytes, or to flush the transformer if we
// are out of source bytes. We do this even if r.r.Read returned an error.
// As the io.Reader documentation says, "process the n > 0 bytes returned
// before considering the error".
if r.src0 != r.src1 || r.err != nil {
r.dst0 = 0
r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
r.src0 += n
switch {
case err == nil:
if r.src0 != r.src1 {
r.err = errInconsistentByteCount
}
// The Transform call was successful; we are complete if we
// cannot read more bytes into src.
r.transformComplete = r.err != nil
continue
case err == ErrShortDst && (r.dst1 != 0 || n != 0):
// Make room in dst by copying out, and try again.
continue
case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
// Read more bytes into src via the code below, and try again.
default:
r.transformComplete = true
// The reader error (r.err) takes precedence over the
// transformer error (err) unless r.err is nil or io.EOF.
if r.err == nil || r.err == io.EOF {
r.err = err
}
continue
}
}
// Move any untransformed source bytes to the start of the buffer
// and read more bytes.
if r.src0 != 0 {
r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
}
n, r.err = r.r.Read(r.src[r.src1:])
r.src1 += n
}
}
// TODO: implement ReadByte (and ReadRune??).
// Writer wraps another io.Writer by transforming the bytes read.
// The user needs to call Close to flush unwritten bytes that may
// be buffered.
type Writer struct {
w io.Writer
t Transformer
dst []byte
// src[:n] contains bytes that have not yet passed through t.
src []byte
n int
}
// NewWriter returns a new Writer that wraps w by transforming the bytes written
// via t. It calls Reset on t.
func NewWriter(w io.Writer, t Transformer) *Writer {
t.Reset()
return &Writer{
w: w,
t: t,
dst: make([]byte, defaultBufSize),
src: make([]byte, defaultBufSize),
}
}
// Write implements the io.Writer interface. If there are not enough
// bytes available to complete a Transform, the bytes will be buffered
// for the next write. Call Close to convert the remaining bytes.
func (w *Writer) Write(data []byte) (n int, err error) {
src := data
if w.n > 0 {
// Append bytes from data to the last remainder.
// TODO: limit the amount copied on first try.
n = copy(w.src[w.n:], data)
w.n += n
src = w.src[:w.n]
}
for {
nDst, nSrc, err := w.t.Transform(w.dst, src, false)
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
return n, werr
}
src = src[nSrc:]
if w.n > 0 && len(src) <= n {
// Enough bytes from w.src have been consumed. We make src point
// to data instead to reduce the copying.
w.n = 0
n -= len(src)
src = data[n:]
if n < len(data) && (err == nil || err == ErrShortSrc) {
continue
}
} else {
n += nSrc
}
switch {
case err == ErrShortDst && (nDst > 0 || nSrc > 0):
case err == ErrShortSrc && len(src) < len(w.src):
m := copy(w.src, src)
// If w.n > 0, bytes from data were already copied to w.src and n
// was already set to the number of bytes consumed.
if w.n == 0 {
n += m
}
w.n = m
return n, nil
case err == nil && w.n > 0:
return n, errInconsistentByteCount
default:
return n, err
}
}
}
// Close implements the io.Closer interface.
func (w *Writer) Close() error {
for src := w.src[:w.n]; len(src) > 0; {
nDst, nSrc, err := w.t.Transform(w.dst, src, true)
if nDst == 0 {
return err
}
if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
return werr
}
if err != ErrShortDst {
return err
}
src = src[nSrc:]
}
return nil
}
type nop struct{ NopResetter }
func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
n := copy(dst, src)
if n < len(src) {
err = ErrShortDst
}
return n, n, err
}
type discard struct{ NopResetter }
func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
return 0, len(src), nil
}
var (
// Discard is a Transformer for which all Transform calls succeed
// by consuming all bytes and writing nothing.
Discard Transformer = discard{}
// Nop is a Transformer that copies src to dst.
Nop Transformer = nop{}
)
// chain is a sequence of links. A chain with N Transformers has N+1 links and
// N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
// buffers given to chain.Transform and the middle N-1 buffers are intermediate
// buffers owned by the chain. The i'th link transforms bytes from the i'th
// buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
// chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
type chain struct {
link []link
err error
// errStart is the index at which the error occurred plus 1. Processing
// errStart at this level at the next call to Transform. As long as
// errStart > 0, chain will not consume any more source bytes.
errStart int
}
func (c *chain) fatalError(errIndex int, err error) {
if i := errIndex + 1; i > c.errStart {
c.errStart = i
c.err = err
}
}
type link struct {
t Transformer
// b[p:n] holds the bytes to be transformed by t.
b []byte
p int
n int
}
func (l *link) src() []byte {
return l.b[l.p:l.n]
}
func (l *link) dst() []byte {
return l.b[l.n:]
}
// Chain returns a Transformer that applies t in sequence.
func Chain(t ...Transformer) Transformer {
if len(t) == 0 {
return nop{}
}
c := &chain{link: make([]link, len(t)+1)}
for i, tt := range t {
c.link[i].t = tt
}
// Allocate intermediate buffers.
b := make([][defaultBufSize]byte, len(t)-1)
for i := range b {
c.link[i+1].b = b[i][:]
}
return c
}
// Reset resets the state of Chain. It calls Reset on all the Transformers.
func (c *chain) Reset() {
for i, l := range c.link {
if l.t != nil {
l.t.Reset()
}
c.link[i].p, c.link[i].n = 0, 0
}
}
// Transform applies the transformers of c in sequence.
func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
// Set up src and dst in the chain.
srcL := &c.link[0]
dstL := &c.link[len(c.link)-1]
srcL.b, srcL.p, srcL.n = src, 0, len(src)
dstL.b, dstL.n = dst, 0
var lastFull, needProgress bool // for detecting progress
// i is the index of the next Transformer to apply, for i in [low, high].
// low is the lowest index for which c.link[low] may still produce bytes.
// high is the highest index for which c.link[high] has a Transformer.
// The error returned by Transform determines whether to increase or
// decrease i. We try to completely fill a buffer before converting it.
for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
in, out := &c.link[i], &c.link[i+1]
nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
out.n += nDst
in.p += nSrc
if i > 0 && in.p == in.n {
in.p, in.n = 0, 0
}
needProgress, lastFull = lastFull, false
switch err0 {
case ErrShortDst:
// Process the destination buffer next. Return if we are already
// at the high index.
if i == high {
return dstL.n, srcL.p, ErrShortDst
}
if out.n != 0 {
i++
// If the Transformer at the next index is not able to process any
// source bytes there is nothing that can be done to make progress
// and the bytes will remain unprocessed. lastFull is used to
// detect this and break out of the loop with a fatal error.
lastFull = true
continue
}
// The destination buffer was too small, but is completely empty.
// Return a fatal error as this transformation can never complete.
c.fatalError(i, errShortInternal)
case ErrShortSrc:
if i == 0 {
// Save ErrShortSrc in err. All other errors take precedence.
err = ErrShortSrc
break
}
// Source bytes were depleted before filling up the destination buffer.
// Verify we made some progress, move the remaining bytes to the errStart
// and try to get more source bytes.
if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
// There were not enough source bytes to proceed while the source
// buffer cannot hold any more bytes. Return a fatal error as this
// transformation can never complete.
c.fatalError(i, errShortInternal)
break
}
// in.b is an internal buffer and we can make progress.
in.p, in.n = 0, copy(in.b, in.src())
fallthrough
case nil:
// if i == low, we have depleted the bytes at index i or any lower levels.
// In that case we increase low and i. In all other cases we decrease i to
// fetch more bytes before proceeding to the next index.
if i > low {
i--
continue
}
default:
c.fatalError(i, err0)
}
// Exhausted level low or fatal error: increase low and continue
// to process the bytes accepted so far.
i++
low = i
}
// If c.errStart > 0, this means we found a fatal error. We will clear
// all upstream buffers. At this point, no more progress can be made
// downstream, as Transform would have bailed while handling ErrShortDst.
if c.errStart > 0 {
for i := 1; i < c.errStart; i++ {
c.link[i].p, c.link[i].n = 0, 0
}
err, c.errStart, c.err = c.err, 0, nil
}
return dstL.n, srcL.p, err
}
// RemoveFunc returns a Transformer that removes from the input all runes r for
// which f(r) is true. Illegal bytes in the input are replaced by RuneError.
func RemoveFunc(f func(r rune) bool) Transformer {
return removeF(f)
}
type removeF func(r rune) bool
func (removeF) Reset() {}
// Transform implements the Transformer interface.
func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
if r = rune(src[0]); r < utf8.RuneSelf {
sz = 1
} else {
r, sz = utf8.DecodeRune(src)
if sz == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src) {
err = ErrShortSrc
break
}
// We replace illegal bytes with RuneError. Not doing so might
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
// The resulting byte sequence may subsequently contain runes
// for which t(r) is true that were passed unnoticed.
if !t(r) {
if nDst+3 > len(dst) {
err = ErrShortDst
break
}
nDst += copy(dst[nDst:], "\uFFFD")
}
nSrc++
continue
}
}
if !t(r) {
if nDst+sz > len(dst) {
err = ErrShortDst
break
}
nDst += copy(dst[nDst:], src[:sz])
}
nSrc += sz
}
return
}
// grow returns a new []byte that is longer than b, and copies the first n bytes
// of b to the start of the new slice.
func grow(b []byte, n int) []byte {
m := len(b)
if m <= 256 {
m *= 2
} else {
m += m >> 1
}
buf := make([]byte, m)
copy(buf, b[:n])
return buf
}
const initialBufSize = 128
// String returns a string with the result of converting s[:n] using t, where
// n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
func String(t Transformer, s string) (result string, n int, err error) {
if s == "" {
return "", 0, nil
}
t.Reset()
// Allocate only once. Note that both dst and src escape when passed to
// Transform.
buf := [2 * initialBufSize]byte{}
dst := buf[:initialBufSize:initialBufSize]
src := buf[initialBufSize : 2*initialBufSize]
// Avoid allocation if the transformed string is identical to the original.
// After this loop, pDst will point to the furthest point in s for which it
// could be detected that t gives equal results, src[:nSrc] will
// indicated the last processed chunk of s for which the output is not equal
// and dst[:nDst] will be the transform of this chunk.
var nDst, nSrc int
pDst := 0 // Used as index in both src and dst in this loop.
for {
n := copy(src, s[pDst:])
nDst, nSrc, err = t.Transform(dst, src[:n], pDst+n == len(s))
// Note 1: we will not enter the loop with pDst == len(s) and we will
// not end the loop with it either. So if nSrc is 0, this means there is
// some kind of error from which we cannot recover given the current
// buffer sizes. We will give up in this case.
// Note 2: it is not entirely correct to simply do a bytes.Equal as
// a Transformer may buffer internally. It will work in most cases,
// though, and no harm is done if it doesn't work.
// TODO: let transformers implement an optional Spanner interface, akin
// to norm's QuickSpan. This would even allow us to avoid any allocation.
if nSrc == 0 || !bytes.Equal(dst[:nDst], src[:nSrc]) {
break
}
if pDst += nDst; pDst == len(s) {
return s, pDst, nil
}
}
// Move the bytes seen so far to dst.
pSrc := pDst + nSrc
if pDst+nDst <= initialBufSize {
copy(dst[pDst:], dst[:nDst])
} else {
b := make([]byte, len(s)+nDst-nSrc)
copy(b[pDst:], dst[:nDst])
dst = b
}
copy(dst, s[:pDst])
pDst += nDst
if err != nil && err != ErrShortDst && err != ErrShortSrc {
return string(dst[:pDst]), pSrc, err
}
// Complete the string with the remainder.
for {
n := copy(src, s[pSrc:])
nDst, nSrc, err = t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
pDst += nDst
pSrc += nSrc
switch err {
case nil:
if pSrc == len(s) {
return string(dst[:pDst]), pSrc, nil
}
case ErrShortDst:
// Do not grow as long as we can make progress. This may avoid
// excessive allocations.
if nDst == 0 {
dst = grow(dst, pDst)
}
case ErrShortSrc:
if nSrc == 0 {
src = grow(src, 0)
}
default:
return string(dst[:pDst]), pSrc, err
}
}
}
// Bytes returns a new byte slice with the result of converting b[:n] using t,
// where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
t.Reset()
dst := make([]byte, len(b))
pDst, pSrc := 0, 0
for {
nDst, nSrc, err := t.Transform(dst[pDst:], b[pSrc:], true)
pDst += nDst
pSrc += nSrc
if err != ErrShortDst {
return dst[:pDst], pSrc, err
}
// Grow the destination buffer, but do not grow as long as we can make
// progress. This may avoid excessive allocations.
if nDst == 0 {
dst = grow(dst, pDst)
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,23 @@
# Copyright 2011 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
maketables: maketables.go triegen.go
go build $^
normregtest: normregtest.go
go build $^
tables: maketables
./maketables > tables.go
gofmt -w tables.go
# Downloads from www.unicode.org, so not part
# of standard test scripts.
test: testtables regtest
testtables: maketables
./maketables -test > data_test.go && go test -tags=test
regtest: normregtest
./normregtest

View File

@@ -0,0 +1,514 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "unicode/utf8"
const (
maxNonStarters = 30
// The maximum number of characters needed for a buffer is
// maxNonStarters + 1 for the starter + 1 for the GCJ
maxBufferSize = maxNonStarters + 2
maxNFCExpansion = 3 // NFC(0x1D160)
maxNFKCExpansion = 18 // NFKC(0xFDFA)
maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
)
// ssState is used for reporting the segment state after inserting a rune.
// It is returned by streamSafe.next.
type ssState int
const (
// Indicates a rune was successfully added to the segment.
ssSuccess ssState = iota
// Indicates a rune starts a new segment and should not be added.
ssStarter
// Indicates a rune caused a segment overflow and a CGJ should be inserted.
ssOverflow
)
// streamSafe implements the policy of when a CGJ should be inserted.
type streamSafe uint8
// mkStreamSafe is a shorthand for declaring a streamSafe var and calling
// first on it.
func mkStreamSafe(p Properties) streamSafe {
return streamSafe(p.nTrailingNonStarters())
}
// first inserts the first rune of a segment.
func (ss *streamSafe) first(p Properties) {
if *ss != 0 {
panic("!= 0")
}
*ss = streamSafe(p.nTrailingNonStarters())
}
// insert returns a ssState value to indicate whether a rune represented by p
// can be inserted.
func (ss *streamSafe) next(p Properties) ssState {
if *ss > maxNonStarters {
panic("streamSafe was not reset")
}
n := p.nLeadingNonStarters()
if *ss += streamSafe(n); *ss > maxNonStarters {
*ss = 0
return ssOverflow
}
// The Stream-Safe Text Processing prescribes that the counting can stop
// as soon as a starter is encountered. However, there are some starters,
// like Jamo V and T, that can combine with other runes, leaving their
// successive non-starters appended to the previous, possibly causing an
// overflow. We will therefore consider any rune with a non-zero nLead to
// be a non-starter. Note that it always hold that if nLead > 0 then
// nLead == nTrail.
if n == 0 {
*ss = 0
return ssStarter
}
return ssSuccess
}
// backwards is used for checking for overflow and segment starts
// when traversing a string backwards. Users do not need to call first
// for the first rune. The state of the streamSafe retains the count of
// the non-starters loaded.
func (ss *streamSafe) backwards(p Properties) ssState {
if *ss > maxNonStarters {
panic("streamSafe was not reset")
}
c := *ss + streamSafe(p.nTrailingNonStarters())
if c > maxNonStarters {
return ssOverflow
}
*ss = c
if p.nLeadingNonStarters() == 0 {
return ssStarter
}
return ssSuccess
}
func (ss streamSafe) isMax() bool {
return ss == maxNonStarters
}
// GraphemeJoiner is inserted after maxNonStarters non-starter runes.
const GraphemeJoiner = "\u034F"
// reorderBuffer is used to normalize a single segment. Characters inserted with
// insert are decomposed and reordered based on CCC. The compose method can
// be used to recombine characters. Note that the byte buffer does not hold
// the UTF-8 characters in order. Only the rune array is maintained in sorted
// order. flush writes the resulting segment to a byte array.
type reorderBuffer struct {
rune [maxBufferSize]Properties // Per character info.
byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos.
nbyte uint8 // Number or bytes.
ss streamSafe // For limiting length of non-starter sequence.
nrune int // Number of runeInfos.
f formInfo
src input
nsrc int
tmpBytes input
out []byte
flushF func(*reorderBuffer) bool
}
func (rb *reorderBuffer) init(f Form, src []byte) {
rb.f = *formTable[f]
rb.src.setBytes(src)
rb.nsrc = len(src)
rb.ss = 0
}
func (rb *reorderBuffer) initString(f Form, src string) {
rb.f = *formTable[f]
rb.src.setString(src)
rb.nsrc = len(src)
rb.ss = 0
}
func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
rb.out = out
rb.flushF = f
}
// reset discards all characters from the buffer.
func (rb *reorderBuffer) reset() {
rb.nrune = 0
rb.nbyte = 0
rb.ss = 0
}
func (rb *reorderBuffer) doFlush() bool {
if rb.f.composing {
rb.compose()
}
res := rb.flushF(rb)
rb.reset()
return res
}
// appendFlush appends the normalized segment to rb.out.
func appendFlush(rb *reorderBuffer) bool {
for i := 0; i < rb.nrune; i++ {
start := rb.rune[i].pos
end := start + rb.rune[i].size
rb.out = append(rb.out, rb.byte[start:end]...)
}
return true
}
// flush appends the normalized segment to out and resets rb.
func (rb *reorderBuffer) flush(out []byte) []byte {
for i := 0; i < rb.nrune; i++ {
start := rb.rune[i].pos
end := start + rb.rune[i].size
out = append(out, rb.byte[start:end]...)
}
rb.reset()
return out
}
// flushCopy copies the normalized segment to buf and resets rb.
// It returns the number of bytes written to buf.
func (rb *reorderBuffer) flushCopy(buf []byte) int {
p := 0
for i := 0; i < rb.nrune; i++ {
runep := rb.rune[i]
p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size])
}
rb.reset()
return p
}
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
// It returns false if the buffer is not large enough to hold the rune.
// It is used internally by insert and insertString only.
func (rb *reorderBuffer) insertOrdered(info Properties) {
n := rb.nrune
b := rb.rune[:]
cc := info.ccc
if cc > 0 {
// Find insertion position + move elements to make room.
for ; n > 0; n-- {
if b[n-1].ccc <= cc {
break
}
b[n] = b[n-1]
}
}
rb.nrune += 1
pos := uint8(rb.nbyte)
rb.nbyte += utf8.UTFMax
info.pos = pos
b[n] = info
}
// insertErr is an error code returned by insert. Using this type instead
// of error improves performance up to 20% for many of the benchmarks.
type insertErr int
const (
iSuccess insertErr = -iota
iShortDst
iShortSrc
)
// insertFlush inserts the given rune in the buffer ordered by CCC.
// If a decomposition with multiple segments are encountered, they leading
// ones are flushed.
// It returns a non-zero error code if the rune was not inserted.
func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
if rune := src.hangul(i); rune != 0 {
rb.decomposeHangul(rune)
return iSuccess
}
if info.hasDecomposition() {
return rb.insertDecomposed(info.Decomposition())
}
rb.insertSingle(src, i, info)
return iSuccess
}
// insertUnsafe inserts the given rune in the buffer ordered by CCC.
// It is assumed there is sufficient space to hold the runes. It is the
// responsibility of the caller to ensure this. This can be done by checking
// the state returned by the streamSafe type.
func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
if rune := src.hangul(i); rune != 0 {
rb.decomposeHangul(rune)
}
if info.hasDecomposition() {
// TODO: inline.
rb.insertDecomposed(info.Decomposition())
} else {
rb.insertSingle(src, i, info)
}
}
// insertDecomposed inserts an entry in to the reorderBuffer for each rune
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
// It flushes the buffer on each new segment start.
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
rb.tmpBytes.setBytes(dcomp)
for i := 0; i < len(dcomp); {
info := rb.f.info(rb.tmpBytes, i)
if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
return iShortDst
}
i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
rb.insertOrdered(info)
}
return iSuccess
}
// insertSingle inserts an entry in the reorderBuffer for the rune at
// position i. info is the runeInfo for the rune at position i.
func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
rb.insertOrdered(info)
}
// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
func (rb *reorderBuffer) insertCGJ() {
rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
}
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
func (rb *reorderBuffer) appendRune(r rune) {
bn := rb.nbyte
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.nbyte += utf8.UTFMax
rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)}
rb.nrune++
}
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
func (rb *reorderBuffer) assignRune(pos int, r rune) {
bn := rb.rune[pos].pos
sz := utf8.EncodeRune(rb.byte[bn:], rune(r))
rb.rune[pos] = Properties{pos: bn, size: uint8(sz)}
}
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
func (rb *reorderBuffer) runeAt(n int) rune {
inf := rb.rune[n]
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size])
return r
}
// bytesAt returns the UTF-8 encoding of the rune at position n.
// It is used for Hangul and recomposition.
func (rb *reorderBuffer) bytesAt(n int) []byte {
inf := rb.rune[n]
return rb.byte[inf.pos : int(inf.pos)+int(inf.size)]
}
// For Hangul we combine algorithmically, instead of using tables.
const (
hangulBase = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
hangulBase0 = 0xEA
hangulBase1 = 0xB0
hangulBase2 = 0x80
hangulEnd = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
hangulEnd0 = 0xED
hangulEnd1 = 0x9E
hangulEnd2 = 0xA4
jamoLBase = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
jamoLBase0 = 0xE1
jamoLBase1 = 0x84
jamoLEnd = 0x1113
jamoVBase = 0x1161
jamoVEnd = 0x1176
jamoTBase = 0x11A7
jamoTEnd = 0x11C3
jamoTCount = 28
jamoVCount = 21
jamoVTCount = 21 * 28
jamoLVTCount = 19 * 21 * 28
)
const hangulUTF8Size = 3
func isHangul(b []byte) bool {
if len(b) < hangulUTF8Size {
return false
}
b0 := b[0]
if b0 < hangulBase0 {
return false
}
b1 := b[1]
switch {
case b0 == hangulBase0:
return b1 >= hangulBase1
case b0 < hangulEnd0:
return true
case b0 > hangulEnd0:
return false
case b1 < hangulEnd1:
return true
}
return b1 == hangulEnd1 && b[2] < hangulEnd2
}
func isHangulString(b string) bool {
if len(b) < hangulUTF8Size {
return false
}
b0 := b[0]
if b0 < hangulBase0 {
return false
}
b1 := b[1]
switch {
case b0 == hangulBase0:
return b1 >= hangulBase1
case b0 < hangulEnd0:
return true
case b0 > hangulEnd0:
return false
case b1 < hangulEnd1:
return true
}
return b1 == hangulEnd1 && b[2] < hangulEnd2
}
// Caller must ensure len(b) >= 2.
func isJamoVT(b []byte) bool {
// True if (rune & 0xff00) == jamoLBase
return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
}
func isHangulWithoutJamoT(b []byte) bool {
c, _ := utf8.DecodeRune(b)
c -= hangulBase
return c < jamoLVTCount && c%jamoTCount == 0
}
// decomposeHangul writes the decomposed Hangul to buf and returns the number
// of bytes written. len(buf) should be at least 9.
func decomposeHangul(buf []byte, r rune) int {
const JamoUTF8Len = 3
r -= hangulBase
x := r % jamoTCount
r /= jamoTCount
utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
if x != 0 {
utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
return 3 * JamoUTF8Len
}
return 2 * JamoUTF8Len
}
// decomposeHangul algorithmically decomposes a Hangul rune into
// its Jamo components.
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
func (rb *reorderBuffer) decomposeHangul(r rune) {
r -= hangulBase
x := r % jamoTCount
r /= jamoTCount
rb.appendRune(jamoLBase + r/jamoVCount)
rb.appendRune(jamoVBase + r%jamoVCount)
if x != 0 {
rb.appendRune(jamoTBase + x)
}
}
// combineHangul algorithmically combines Jamo character components into Hangul.
// See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
func (rb *reorderBuffer) combineHangul(s, i, k int) {
b := rb.rune[:]
bn := rb.nrune
for ; i < bn; i++ {
cccB := b[k-1].ccc
cccC := b[i].ccc
if cccB == 0 {
s = k - 1
}
if s != k-1 && cccB >= cccC {
// b[i] is blocked by greater-equal cccX below it
b[k] = b[i]
k++
} else {
l := rb.runeAt(s) // also used to compare to hangulBase
v := rb.runeAt(i) // also used to compare to jamoT
switch {
case jamoLBase <= l && l < jamoLEnd &&
jamoVBase <= v && v < jamoVEnd:
// 11xx plus 116x to LV
rb.assignRune(s, hangulBase+
(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
case hangulBase <= l && l < hangulEnd &&
jamoTBase < v && v < jamoTEnd &&
((l-hangulBase)%jamoTCount) == 0:
// ACxx plus 11Ax to LVT
rb.assignRune(s, l+v-jamoTBase)
default:
b[k] = b[i]
k++
}
}
}
rb.nrune = k
}
// compose recombines the runes in the buffer.
// It should only be used to recompose a single segment, as it will not
// handle alternations between Hangul and non-Hangul characters correctly.
func (rb *reorderBuffer) compose() {
// UAX #15, section X5 , including Corrigendum #5
// "In any character sequence beginning with starter S, a character C is
// blocked from S if and only if there is some character B between S
// and C, and either B is a starter or it has the same or higher
// combining class as C."
bn := rb.nrune
if bn == 0 {
return
}
k := 1
b := rb.rune[:]
for s, i := 0, 1; i < bn; i++ {
if isJamoVT(rb.bytesAt(i)) {
// Redo from start in Hangul mode. Necessary to support
// U+320E..U+321E in NFKC mode.
rb.combineHangul(s, i, k)
return
}
ii := b[i]
// We can only use combineForward as a filter if we later
// get the info for the combined character. This is more
// expensive than using the filter. Using combinesBackward()
// is safe.
if ii.combinesBackward() {
cccB := b[k-1].ccc
cccC := ii.ccc
blocked := false // b[i] blocked by starter or greater or equal CCC?
if cccB == 0 {
s = k - 1
} else {
blocked = s != k-1 && cccB >= cccC
}
if !blocked {
combined := combine(rb.runeAt(s), rb.runeAt(i))
if combined != 0 {
rb.assignRune(s, combined)
continue
}
}
}
b[k] = b[i]
k++
}
rb.nrune = k
}

View File

@@ -0,0 +1,130 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "testing"
// TestCase is used for most tests.
type TestCase struct {
in []rune
out []rune
}
func runTests(t *testing.T, name string, fm Form, tests []TestCase) {
rb := reorderBuffer{}
rb.init(fm, nil)
for i, test := range tests {
rb.setFlusher(nil, appendFlush)
for j, rune := range test.in {
b := []byte(string(rune))
src := inputBytes(b)
info := rb.f.info(src, 0)
if j == 0 {
rb.ss.first(info)
} else {
rb.ss.next(info)
}
if rb.insertFlush(src, 0, info) < 0 {
t.Errorf("%s:%d: insert failed for rune %d", name, i, j)
}
}
rb.doFlush()
was := string(rb.out)
want := string(test.out)
if len(was) != len(want) {
t.Errorf("%s:%d: length = %d; want %d", name, i, len(was), len(want))
}
if was != want {
k, pfx := pidx(was, want)
t.Errorf("%s:%d: \nwas %s%+q; \nwant %s%+q", name, i, pfx, was[k:], pfx, want[k:])
}
}
}
func TestFlush(t *testing.T) {
const (
hello = "Hello "
world = "world!"
)
buf := make([]byte, maxByteBufferSize)
p := copy(buf, hello)
out := buf[p:]
rb := reorderBuffer{}
rb.initString(NFC, world)
if i := rb.flushCopy(out); i != 0 {
t.Errorf("wrote bytes on flush of empty buffer. (len(out) = %d)", i)
}
for i := range world {
// No need to set streamSafe values for this test.
rb.insertFlush(rb.src, i, rb.f.info(rb.src, i))
n := rb.flushCopy(out)
out = out[n:]
p += n
}
was := buf[:p]
want := hello + world
if string(was) != want {
t.Errorf(`output after flush was "%s"; want "%s"`, string(was), want)
}
if rb.nrune != 0 {
t.Errorf("non-null size of info buffer (rb.nrune == %d)", rb.nrune)
}
if rb.nbyte != 0 {
t.Errorf("non-null size of byte buffer (rb.nbyte == %d)", rb.nbyte)
}
}
var insertTests = []TestCase{
{[]rune{'a'}, []rune{'a'}},
{[]rune{0x300}, []rune{0x300}},
{[]rune{0x300, 0x316}, []rune{0x316, 0x300}}, // CCC(0x300)==230; CCC(0x316)==220
{[]rune{0x316, 0x300}, []rune{0x316, 0x300}},
{[]rune{0x41, 0x316, 0x300}, []rune{0x41, 0x316, 0x300}},
{[]rune{0x41, 0x300, 0x316}, []rune{0x41, 0x316, 0x300}},
{[]rune{0x300, 0x316, 0x41}, []rune{0x316, 0x300, 0x41}},
{[]rune{0x41, 0x300, 0x40, 0x316}, []rune{0x41, 0x300, 0x40, 0x316}},
}
func TestInsert(t *testing.T) {
runTests(t, "TestInsert", NFD, insertTests)
}
var decompositionNFDTest = []TestCase{
{[]rune{0xC0}, []rune{0x41, 0x300}},
{[]rune{0xAC00}, []rune{0x1100, 0x1161}},
{[]rune{0x01C4}, []rune{0x01C4}},
{[]rune{0x320E}, []rune{0x320E}},
{[]rune("음ẻ과"), []rune{0x110B, 0x1173, 0x11B7, 0x65, 0x309, 0x1100, 0x116A}},
}
var decompositionNFKDTest = []TestCase{
{[]rune{0xC0}, []rune{0x41, 0x300}},
{[]rune{0xAC00}, []rune{0x1100, 0x1161}},
{[]rune{0x01C4}, []rune{0x44, 0x5A, 0x030C}},
{[]rune{0x320E}, []rune{0x28, 0x1100, 0x1161, 0x29}},
}
func TestDecomposition(t *testing.T) {
runTests(t, "TestDecompositionNFD", NFD, decompositionNFDTest)
runTests(t, "TestDecompositionNFKD", NFKD, decompositionNFKDTest)
}
var compositionTest = []TestCase{
{[]rune{0x41, 0x300}, []rune{0xC0}},
{[]rune{0x41, 0x316}, []rune{0x41, 0x316}},
{[]rune{0x41, 0x300, 0x35D}, []rune{0xC0, 0x35D}},
{[]rune{0x41, 0x316, 0x300}, []rune{0xC0, 0x316}},
// blocking starter
{[]rune{0x41, 0x316, 0x40, 0x300}, []rune{0x41, 0x316, 0x40, 0x300}},
{[]rune{0x1100, 0x1161}, []rune{0xAC00}},
// parenthesized Hangul, alternate between ASCII and Hangul.
{[]rune{0x28, 0x1100, 0x1161, 0x29}, []rune{0x28, 0xAC00, 0x29}},
}
func TestComposition(t *testing.T) {
runTests(t, "TestComposition", NFC, compositionTest)
}

View File

@@ -0,0 +1,82 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm_test
import (
"bytes"
"fmt"
"unicode/utf8"
"golang.org/x/text/unicode/norm"
)
// EqualSimple uses a norm.Iter to compare two non-normalized
// strings for equivalence.
func EqualSimple(a, b string) bool {
var ia, ib norm.Iter
ia.InitString(norm.NFKD, a)
ib.InitString(norm.NFKD, b)
for !ia.Done() && !ib.Done() {
if !bytes.Equal(ia.Next(), ib.Next()) {
return false
}
}
return ia.Done() && ib.Done()
}
// FindPrefix finds the longest common prefix of ASCII characters
// of a and b.
func FindPrefix(a, b string) int {
i := 0
for ; i < len(a) && i < len(b) && a[i] < utf8.RuneSelf && a[i] == b[i]; i++ {
}
return i
}
// EqualOpt is like EqualSimple, but optimizes the special
// case for ASCII characters.
func EqualOpt(a, b string) bool {
n := FindPrefix(a, b)
a, b = a[n:], b[n:]
var ia, ib norm.Iter
ia.InitString(norm.NFKD, a)
ib.InitString(norm.NFKD, b)
for !ia.Done() && !ib.Done() {
if !bytes.Equal(ia.Next(), ib.Next()) {
return false
}
if n := int64(FindPrefix(a[ia.Pos():], b[ib.Pos():])); n != 0 {
ia.Seek(n, 1)
ib.Seek(n, 1)
}
}
return ia.Done() && ib.Done()
}
var compareTests = []struct{ a, b string }{
{"aaa", "aaa"},
{"aaa", "aab"},
{"a\u0300a", "\u00E0a"},
{"a\u0300\u0320b", "a\u0320\u0300b"},
{"\u1E0A\u0323", "\x44\u0323\u0307"},
// A character that decomposes into multiple segments
// spans several iterations.
{"\u3304", "\u30A4\u30CB\u30F3\u30AF\u3099"},
}
func ExampleIter() {
for i, t := range compareTests {
r0 := EqualSimple(t.a, t.b)
r1 := EqualOpt(t.a, t.b)
fmt.Printf("%d: %v %v\n", i, r0, r1)
}
// Output:
// 0: true true
// 1: false false
// 2: true true
// 3: true true
// 4: true true
// 5: true true
}

View File

@@ -0,0 +1,256 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
// This file contains Form-specific logic and wrappers for data in tables.go.
// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
// bits
// 15: 1 (inverse of NFD_QD bit of qcInfo)
// 13..7: qcInfo (see below). isYesD is always true (no decompostion).
// 6..0: ccc (compressed CCC value).
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
// <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.
const (
qcInfoMask = 0x3F // to clear all but the relevant bits in a qcInfo
headerLenMask = 0x3F // extract the length value from the header byte
headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
// Properties provides access to normalization properties of a rune.
type Properties struct {
pos uint8 // start position in reorderBuffer; used in composition.go
size uint8 // length of UTF-8 encoding of this rune
ccc uint8 // leading canonical combining class (ccc if not decomposition)
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
nLead uint8 // number of leading non-starters.
flags qcInfo // quick check flags
index uint16
}
// functions dispatchable per form
type lookupFunc func(b input, i int) Properties
// formInfo holds Form-specific functions and tables.
type formInfo struct {
form Form
composing, compatibility bool // form type
info lookupFunc
nextMain iterFunc
}
var formTable []*formInfo
func init() {
formTable = make([]*formInfo, 4)
for i := range formTable {
f := &formInfo{}
formTable[i] = f
f.form = Form(i)
if Form(i) == NFKD || Form(i) == NFKC {
f.compatibility = true
f.info = lookupInfoNFKC
} else {
f.info = lookupInfoNFC
}
f.nextMain = nextDecomposed
if Form(i) == NFC || Form(i) == NFKC {
f.nextMain = nextComposed
f.composing = true
}
}
}
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
// unexpected behavior for the user. For example, in NFD, there is a boundary
// after 'a'. However, 'a' might combine with modifiers, so from the application's
// perspective it is not a good boundary. We will therefore always use the
// boundaries for the combining variants.
// BoundaryBefore returns true if this rune starts a new segment and
// cannot combine with any rune on the left.
func (p Properties) BoundaryBefore() bool {
if p.ccc == 0 && !p.combinesBackward() {
return true
}
// We assume that the CCC of the first character in a decomposition
// is always non-zero if different from info.ccc and that we can return
// false at this point. This is verified by maketables.
return false
}
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
func (p Properties) BoundaryAfter() bool {
// TODO: loosen these conditions.
return p.isInert()
}
// We pack quick check data in 4 bits:
// 5: Combines forward (0 == false, 1 == true)
// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
// 1..0: Number of trailing non-starters.
//
// When all 4 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
type qcInfo uint8
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
func (p Properties) isInert() bool {
return p.flags&qcInfoMask == 0 && p.ccc == 0
}
func (p Properties) multiSegment() bool {
return p.index >= firstMulti && p.index < endMulti
}
func (p Properties) nLeadingNonStarters() uint8 {
return p.nLead
}
func (p Properties) nTrailingNonStarters() uint8 {
return uint8(p.flags & 0x03)
}
// Decomposition returns the decomposition for the underlying rune
// or nil if there is none.
func (p Properties) Decomposition() []byte {
// TODO: create the decomposition for Hangul?
if p.index == 0 {
return nil
}
i := p.index
n := decomps[i] & headerLenMask
i++
return decomps[i : i+uint16(n)]
}
// Size returns the length of UTF-8 encoding of the rune.
func (p Properties) Size() int {
return int(p.size)
}
// CCC returns the canonical combining class of the underlying rune.
func (p Properties) CCC() uint8 {
if p.index >= firstCCCZeroExcept {
return 0
}
return ccc[p.ccc]
}
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
func (p Properties) LeadCCC() uint8 {
return ccc[p.ccc]
}
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
func (p Properties) TrailCCC() uint8 {
return ccc[p.tccc]
}
// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition map for NFC and NFKC are identical.
// combine returns the combined rune or 0 if it doesn't exist.
func combine(a, b rune) rune {
key := uint32(uint16(a))<<16 + uint32(uint16(b))
return recompMap[key]
}
func lookupInfoNFC(b input, i int) Properties {
v, sz := b.charinfoNFC(i)
return compInfo(v, sz)
}
func lookupInfoNFKC(b input, i int) Properties {
v, sz := b.charinfoNFKC(i)
return compInfo(v, sz)
}
// Properties returns properties for the first rune in s.
func (f Form) Properties(s []byte) Properties {
if f == NFC || f == NFD {
return compInfo(nfcData.lookup(s))
}
return compInfo(nfkcData.lookup(s))
}
// PropertiesString returns properties for the first rune in s.
func (f Form) PropertiesString(s string) Properties {
if f == NFC || f == NFD {
return compInfo(nfcData.lookupString(s))
}
return compInfo(nfkcData.lookupString(s))
}
// compInfo converts the information contained in v and sz
// to a Properties. See the comment at the top of the file
// for more information on the format.
func compInfo(v uint16, sz int) Properties {
if v == 0 {
return Properties{size: uint8(sz)}
} else if v >= 0x8000 {
p := Properties{
size: uint8(sz),
ccc: uint8(v),
tccc: uint8(v),
flags: qcInfo(v >> 8),
}
if p.ccc > 0 || p.combinesBackward() {
p.nLead = uint8(p.flags & 0x3)
}
return p
}
// has decomposition
h := decomps[v]
f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
p := Properties{size: uint8(sz), flags: f, index: v}
if v >= firstCCC {
v += uint16(h&headerLenMask) + 1
c := decomps[v]
p.tccc = c >> 2
p.flags |= qcInfo(c & 0x3)
if v >= firstLeadingCCC {
p.nLead = c & 0x3
if v >= firstStarterWithNLead {
// We were tricked. Remove the decomposition.
p.flags &= 0x03
p.index = 0
return p
}
p.ccc = decomps[v+1]
}
}
return p
}

View File

@@ -0,0 +1,54 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build test
package norm
import "testing"
func TestProperties(t *testing.T) {
var d runeData
CK := [2]string{"C", "K"}
for k, r := 1, rune(0); r < 0x2ffff; r++ {
if k < len(testData) && r == testData[k].r {
d = testData[k]
k++
}
s := string(r)
for j, p := range []Properties{NFC.PropertiesString(s), NFKC.PropertiesString(s)} {
f := d.f[j]
if p.CCC() != d.ccc {
t.Errorf("%U: ccc(%s): was %d; want %d %X", r, CK[j], p.CCC(), d.ccc, p.index)
}
if p.isYesC() != (f.qc == Yes) {
t.Errorf("%U: YesC(%s): was %v; want %v", r, CK[j], p.isYesC(), f.qc == Yes)
}
if p.combinesBackward() != (f.qc == Maybe) {
t.Errorf("%U: combines backwards(%s): was %v; want %v", r, CK[j], p.combinesBackward(), f.qc == Maybe)
}
if p.nLeadingNonStarters() != d.nLead {
t.Errorf("%U: nLead(%s): was %d; want %d %#v %#v", r, CK[j], p.nLeadingNonStarters(), d.nLead, p, d)
}
if p.nTrailingNonStarters() != d.nTrail {
t.Errorf("%U: nTrail(%s): was %d; want %d %#v %#v", r, CK[j], p.nTrailingNonStarters(), d.nTrail, p, d)
}
if p.combinesForward() != f.combinesForward {
t.Errorf("%U: combines forward(%s): was %v; want %v %#v", r, CK[j], p.combinesForward(), f.combinesForward, p)
}
// Skip Hangul as it is algorithmically computed.
if r >= hangulBase && r < hangulEnd {
continue
}
if p.hasDecomposition() {
if has := f.decomposition != ""; !has {
t.Errorf("%U: hasDecomposition(%s): was %v; want %v", r, CK[j], p.hasDecomposition(), has)
}
if string(p.Decomposition()) != f.decomposition {
t.Errorf("%U: decomp(%s): was %+q; want %+q", r, CK[j], p.Decomposition(), f.decomposition)
}
}
}
}
}

View File

@@ -0,0 +1,105 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "unicode/utf8"
type input struct {
str string
bytes []byte
}
func inputBytes(str []byte) input {
return input{bytes: str}
}
func inputString(str string) input {
return input{str: str}
}
func (in *input) setBytes(str []byte) {
in.str = ""
in.bytes = str
}
func (in *input) setString(str string) {
in.str = str
in.bytes = nil
}
func (in *input) _byte(p int) byte {
if in.bytes == nil {
return in.str[p]
}
return in.bytes[p]
}
func (in *input) skipASCII(p, max int) int {
if in.bytes == nil {
for ; p < max && in.str[p] < utf8.RuneSelf; p++ {
}
} else {
for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ {
}
}
return p
}
func (in *input) skipContinuationBytes(p int) int {
if in.bytes == nil {
for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ {
}
} else {
for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ {
}
}
return p
}
func (in *input) appendSlice(buf []byte, b, e int) []byte {
if in.bytes != nil {
return append(buf, in.bytes[b:e]...)
}
for i := b; i < e; i++ {
buf = append(buf, in.str[i])
}
return buf
}
func (in *input) copySlice(buf []byte, b, e int) int {
if in.bytes == nil {
return copy(buf, in.str[b:e])
}
return copy(buf, in.bytes[b:e])
}
func (in *input) charinfoNFC(p int) (uint16, int) {
if in.bytes == nil {
return nfcData.lookupString(in.str[p:])
}
return nfcData.lookup(in.bytes[p:])
}
func (in *input) charinfoNFKC(p int) (uint16, int) {
if in.bytes == nil {
return nfkcData.lookupString(in.str[p:])
}
return nfkcData.lookup(in.bytes[p:])
}
func (in *input) hangul(p int) (r rune) {
if in.bytes == nil {
if !isHangulString(in.str[p:]) {
return 0
}
r, _ = utf8.DecodeRuneInString(in.str[p:])
} else {
if !isHangul(in.bytes[p:]) {
return 0
}
r, _ = utf8.DecodeRune(in.bytes[p:])
}
return r
}

View File

@@ -0,0 +1,450 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import (
"fmt"
"unicode/utf8"
)
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
const MaxSegmentSize = maxByteBufferSize
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
type Iter struct {
rb reorderBuffer
buf [maxByteBufferSize]byte
info Properties // first character saved from previous iteration
next iterFunc // implementation of next depends on form
asciiF iterFunc
p int // current position in input source
multiSeg []byte // remainder of multi-segment decomposition
}
type iterFunc func(*Iter) []byte
// Init initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) Init(f Form, src []byte) {
i.p = 0
if len(src) == 0 {
i.setDone()
i.rb.nsrc = 0
return
}
i.multiSeg = nil
i.rb.init(f, src)
i.next = i.rb.f.nextMain
i.asciiF = nextASCIIBytes
i.info = i.rb.f.info(i.rb.src, i.p)
}
// InitString initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) InitString(f Form, src string) {
i.p = 0
if len(src) == 0 {
i.setDone()
i.rb.nsrc = 0
return
}
i.multiSeg = nil
i.rb.initString(f, src)
i.next = i.rb.f.nextMain
i.asciiF = nextASCIIString
i.info = i.rb.f.info(i.rb.src, i.p)
}
// Seek sets the segment to be returned by the next call to Next to start
// at position p. It is the responsibility of the caller to set p to the
// start of a UTF8 rune.
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
var abs int64
switch whence {
case 0:
abs = offset
case 1:
abs = int64(i.p) + offset
case 2:
abs = int64(i.rb.nsrc) + offset
default:
return 0, fmt.Errorf("norm: invalid whence")
}
if abs < 0 {
return 0, fmt.Errorf("norm: negative position")
}
if int(abs) >= i.rb.nsrc {
i.setDone()
return int64(i.p), nil
}
i.p = int(abs)
i.multiSeg = nil
i.next = i.rb.f.nextMain
i.info = i.rb.f.info(i.rb.src, i.p)
return abs, nil
}
// returnSlice returns a slice of the underlying input type as a byte slice.
// If the underlying is of type []byte, it will simply return a slice.
// If the underlying is of type string, it will copy the slice to the buffer
// and return that.
func (i *Iter) returnSlice(a, b int) []byte {
if i.rb.src.bytes == nil {
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
}
return i.rb.src.bytes[a:b]
}
// Pos returns the byte position at which the next call to Next will commence processing.
func (i *Iter) Pos() int {
return i.p
}
func (i *Iter) setDone() {
i.next = nextDone
i.p = i.rb.nsrc
}
// Done returns true if there is no more input to process.
func (i *Iter) Done() bool {
return i.p >= i.rb.nsrc
}
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
func (i *Iter) Next() []byte {
return i.next(i)
}
func nextASCIIBytes(i *Iter) []byte {
p := i.p + 1
if p >= i.rb.nsrc {
i.setDone()
return i.rb.src.bytes[i.p:p]
}
if i.rb.src.bytes[p] < utf8.RuneSelf {
p0 := i.p
i.p = p
return i.rb.src.bytes[p0:p]
}
i.info = i.rb.f.info(i.rb.src, i.p)
i.next = i.rb.f.nextMain
return i.next(i)
}
func nextASCIIString(i *Iter) []byte {
p := i.p + 1
if p >= i.rb.nsrc {
i.buf[0] = i.rb.src.str[i.p]
i.setDone()
return i.buf[:1]
}
if i.rb.src.str[p] < utf8.RuneSelf {
i.buf[0] = i.rb.src.str[i.p]
i.p = p
return i.buf[:1]
}
i.info = i.rb.f.info(i.rb.src, i.p)
i.next = i.rb.f.nextMain
return i.next(i)
}
func nextHangul(i *Iter) []byte {
p := i.p
next := p + hangulUTF8Size
if next >= i.rb.nsrc {
i.setDone()
} else if i.rb.src.hangul(next) == 0 {
i.info = i.rb.f.info(i.rb.src, i.p)
i.next = i.rb.f.nextMain
return i.next(i)
}
i.p = next
return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
}
func nextDone(i *Iter) []byte {
return nil
}
// nextMulti is used for iterating over multi-segment decompositions
// for decomposing normal forms.
func nextMulti(i *Iter) []byte {
j := 0
d := i.multiSeg
// skip first rune
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
}
for j < len(d) {
info := i.rb.f.info(input{bytes: d}, j)
if info.BoundaryBefore() {
i.multiSeg = d[j:]
return d[:j]
}
j += int(info.size)
}
// treat last segment as normal decomposition
i.next = i.rb.f.nextMain
return i.next(i)
}
// nextMultiNorm is used for iterating over multi-segment decompositions
// for composing normal forms.
func nextMultiNorm(i *Iter) []byte {
j := 0
d := i.multiSeg
for j < len(d) {
info := i.rb.f.info(input{bytes: d}, j)
if info.BoundaryBefore() {
i.rb.compose()
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
i.rb.ss.first(info)
i.rb.insertUnsafe(input{bytes: d}, j, info)
i.multiSeg = d[j+int(info.size):]
return seg
}
i.rb.ss.next(info)
i.rb.insertUnsafe(input{bytes: d}, j, info)
j += int(info.size)
}
i.multiSeg = nil
i.next = nextComposed
return doNormComposed(i)
}
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
func nextDecomposed(i *Iter) (next []byte) {
outp := 0
inCopyStart, outCopyStart := i.p, 0
ss := mkStreamSafe(i.info)
for {
if sz := int(i.info.size); sz <= 1 {
p := i.p
i.p++ // ASCII or illegal byte. Either way, advance by 1.
if i.p >= i.rb.nsrc {
i.setDone()
return i.returnSlice(p, i.p)
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
i.next = i.asciiF
return i.returnSlice(p, i.p)
}
outp++
} else if d := i.info.Decomposition(); d != nil {
// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
// Case 1: there is a leftover to copy. In this case the decomposition
// must begin with a modifier and should always be appended.
// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
p := outp + len(d)
if outp > 0 {
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
if p > len(i.buf) {
return i.buf[:outp]
}
} else if i.info.multiSegment() {
// outp must be 0 as multi-segment decompositions always
// start a new segment.
if i.multiSeg == nil {
i.multiSeg = d
i.next = nextMulti
return nextMulti(i)
}
// We are in the last segment. Treat as normal decomposition.
d = i.multiSeg
i.multiSeg = nil
p = len(d)
}
prevCC := i.info.tccc
if i.p += sz; i.p >= i.rb.nsrc {
i.setDone()
i.info = Properties{} // Force BoundaryBefore to succeed.
} else {
i.info = i.rb.f.info(i.rb.src, i.p)
}
switch ss.next(i.info) {
case ssOverflow:
i.next = nextCGJDecompose
fallthrough
case ssStarter:
if outp > 0 {
copy(i.buf[outp:], d)
return i.buf[:p]
}
return d
}
copy(i.buf[outp:], d)
outp = p
inCopyStart, outCopyStart = i.p, outp
if i.info.ccc < prevCC {
goto doNorm
}
continue
} else if r := i.rb.src.hangul(i.p); r != 0 {
outp = decomposeHangul(i.buf[:], r)
i.p += hangulUTF8Size
inCopyStart, outCopyStart = i.p, outp
if i.p >= i.rb.nsrc {
i.setDone()
break
} else if i.rb.src.hangul(i.p) != 0 {
i.next = nextHangul
return i.buf[:outp]
}
} else {
p := outp + sz
if p > len(i.buf) {
break
}
outp = p
i.p += sz
}
if i.p >= i.rb.nsrc {
i.setDone()
break
}
prevCC := i.info.tccc
i.info = i.rb.f.info(i.rb.src, i.p)
if v := ss.next(i.info); v == ssStarter {
break
} else if v == ssOverflow {
i.next = nextCGJDecompose
break
}
if i.info.ccc < prevCC {
goto doNorm
}
}
if outCopyStart == 0 {
return i.returnSlice(inCopyStart, i.p)
} else if inCopyStart < i.p {
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
}
return i.buf[:outp]
doNorm:
// Insert what we have decomposed so far in the reorderBuffer.
// As we will only reorder, there will always be enough room.
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
i.rb.insertDecomposed(i.buf[0:outp])
return doNormDecomposed(i)
}
func doNormDecomposed(i *Iter) []byte {
for {
if s := i.rb.ss.next(i.info); s == ssOverflow {
i.next = nextCGJDecompose
break
}
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
i.setDone()
break
}
i.info = i.rb.f.info(i.rb.src, i.p)
if i.info.ccc == 0 {
break
}
}
// new segment or too many combining characters: exit normalization
return i.buf[:i.rb.flushCopy(i.buf[:])]
}
func nextCGJDecompose(i *Iter) []byte {
i.rb.ss = 0
i.rb.insertCGJ()
i.next = nextDecomposed
buf := doNormDecomposed(i)
return buf
}
// nextComposed is the implementation of Next for forms NFC and NFKC.
func nextComposed(i *Iter) []byte {
outp, startp := 0, i.p
var prevCC uint8
ss := mkStreamSafe(i.info)
for {
if !i.info.isYesC() {
goto doNorm
}
prevCC = i.info.tccc
sz := int(i.info.size)
if sz == 0 {
sz = 1 // illegal rune: copy byte-by-byte
}
p := outp + sz
if p > len(i.buf) {
break
}
outp = p
i.p += sz
if i.p >= i.rb.nsrc {
i.setDone()
break
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
i.next = i.asciiF
break
}
i.info = i.rb.f.info(i.rb.src, i.p)
if v := ss.next(i.info); v == ssStarter {
break
} else if v == ssOverflow {
i.next = nextCGJCompose
break
}
if i.info.ccc < prevCC {
goto doNorm
}
}
return i.returnSlice(startp, i.p)
doNorm:
i.p = startp
i.info = i.rb.f.info(i.rb.src, i.p)
if i.info.multiSegment() {
d := i.info.Decomposition()
info := i.rb.f.info(input{bytes: d}, 0)
i.rb.insertUnsafe(input{bytes: d}, 0, info)
i.multiSeg = d[int(info.size):]
i.next = nextMultiNorm
return nextMultiNorm(i)
}
i.rb.ss.first(i.info)
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
return doNormComposed(i)
}
func doNormComposed(i *Iter) []byte {
// First rune should already be inserted.
for {
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
i.setDone()
break
}
i.info = i.rb.f.info(i.rb.src, i.p)
if s := i.rb.ss.next(i.info); s == ssStarter {
break
} else if s == ssOverflow {
i.next = nextCGJCompose
break
}
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
}
i.rb.compose()
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
return seg
}
func nextCGJCompose(i *Iter) []byte {
i.rb.ss = 0 // instead of first
i.rb.insertCGJ()
i.next = nextComposed
// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
// If we ever change that, insert a check here.
i.rb.ss.first(i.info)
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
return doNormComposed(i)
}

View File

@@ -0,0 +1,98 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import (
"strings"
"testing"
)
func doIterNorm(f Form, s string) []byte {
acc := []byte{}
i := Iter{}
i.InitString(f, s)
for !i.Done() {
acc = append(acc, i.Next()...)
}
return acc
}
func TestIterNext(t *testing.T) {
runNormTests(t, "IterNext", func(f Form, out []byte, s string) []byte {
return doIterNorm(f, string(append(out, s...)))
})
}
type SegmentTest struct {
in string
out []string
}
var segmentTests = []SegmentTest{
{"\u1E0A\u0323a", []string{"\x44\u0323\u0307", "a", ""}},
{rep('a', segSize), append(strings.Split(rep('a', segSize), ""), "")},
{rep('a', segSize+2), append(strings.Split(rep('a', segSize+2), ""), "")},
{rep('a', segSize) + "\u0300aa",
append(strings.Split(rep('a', segSize-1), ""), "a\u0300", "a", "a", "")},
// U+0f73 is NOT treated as a starter as it is a modifier
{"a" + grave(29) + "\u0f73", []string{"a" + grave(29), cgj + "\u0f73"}},
{"a\u0f73", []string{"a\u0f73"}},
// U+ff9e is treated as a non-starter.
// TODO: should we? Note that this will only affect iteration, as whether
// or not we do so does not affect the normalization output and will either
// way result in consistent iteration output.
{"a" + grave(30) + "\uff9e", []string{"a" + grave(30), cgj + "\uff9e"}},
{"a\uff9e", []string{"a\uff9e"}},
}
var segmentTestsK = []SegmentTest{
{"\u3332", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u3099", ""}},
// last segment of multi-segment decomposition needs normalization
{"\u3332\u093C", []string{"\u30D5", "\u30A1", "\u30E9", "\u30C3", "\u30C8\u093C\u3099", ""}},
{"\u320E", []string{"\x28", "\uAC00", "\x29"}},
// last segment should be copied to start of buffer.
{"\ufdfa", []string{"\u0635", "\u0644", "\u0649", " ", "\u0627", "\u0644", "\u0644", "\u0647", " ", "\u0639", "\u0644", "\u064a", "\u0647", " ", "\u0648", "\u0633", "\u0644", "\u0645", ""}},
{"\ufdfa" + grave(30), []string{"\u0635", "\u0644", "\u0649", " ", "\u0627", "\u0644", "\u0644", "\u0647", " ", "\u0639", "\u0644", "\u064a", "\u0647", " ", "\u0648", "\u0633", "\u0644", "\u0645" + grave(30), ""}},
{"\uFDFA" + grave(64), []string{"\u0635", "\u0644", "\u0649", " ", "\u0627", "\u0644", "\u0644", "\u0647", " ", "\u0639", "\u0644", "\u064a", "\u0647", " ", "\u0648", "\u0633", "\u0644", "\u0645" + grave(30), cgj + grave(30), cgj + grave(4), ""}},
// Hangul and Jamo are grouped togeter.
{"\uAC00", []string{"\u1100\u1161", ""}},
{"\uAC01", []string{"\u1100\u1161\u11A8", ""}},
{"\u1100\u1161", []string{"\u1100\u1161", ""}},
}
// Note that, by design, segmentation is equal for composing and decomposing forms.
func TestIterSegmentation(t *testing.T) {
segmentTest(t, "SegmentTestD", NFD, segmentTests)
segmentTest(t, "SegmentTestC", NFC, segmentTests)
segmentTest(t, "SegmentTestKD", NFKD, segmentTestsK)
segmentTest(t, "SegmentTestKC", NFKC, segmentTestsK)
}
func segmentTest(t *testing.T, name string, f Form, tests []SegmentTest) {
iter := Iter{}
for i, tt := range tests {
iter.InitString(f, tt.in)
for j, seg := range tt.out {
if seg == "" {
if !iter.Done() {
res := string(iter.Next())
t.Errorf(`%s:%d:%d: expected Done()==true, found segment %+q`, name, i, j, res)
}
continue
}
if iter.Done() {
t.Errorf("%s:%d:%d: Done()==true, want false", name, i, j)
}
seg = f.String(seg)
if res := string(iter.Next()); res != seg {
t.Errorf(`%s:%d:%d" segment was %+q (%d); want %+q (%d)`, name, i, j, pc(res), len(res), pc(seg), len(seg))
}
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm_test
import (
"testing"
)
func TestPlaceHolder(t *testing.T) {
// Does nothing, just allows the Makefile to be canonical
// while waiting for the package itself to be written.
}

View File

@@ -0,0 +1,524 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package norm contains types and functions for normalizing Unicode strings.
package norm
import "unicode/utf8"
// A Form denotes a canonical representation of Unicode code points.
// The Unicode-defined normalization and equivalence forms are:
//
// NFC Unicode Normalization Form C
// NFD Unicode Normalization Form D
// NFKC Unicode Normalization Form KC
// NFKD Unicode Normalization Form KD
//
// For a Form f, this documentation uses the notation f(x) to mean
// the bytes or string x converted to the given form.
// A position n in x is called a boundary if conversion to the form can
// proceed independently on both sides:
// f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: http://unicode.org/reports/tr15/ and
// http://unicode.org/notes/tn5/.
type Form int
const (
NFC Form = iota
NFD
NFKC
NFKD
)
// Bytes returns f(b). May return b if f(b) = b.
func (f Form) Bytes(b []byte) []byte {
src := inputBytes(b)
ft := formTable[f]
n, ok := ft.quickSpan(src, 0, len(b), true)
if ok {
return b
}
out := make([]byte, n, len(b))
copy(out, b[0:n])
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
return doAppendInner(&rb, n)
}
// String returns f(s).
func (f Form) String(s string) string {
src := inputString(s)
ft := formTable[f]
n, ok := ft.quickSpan(src, 0, len(s), true)
if ok {
return s
}
out := make([]byte, n, len(s))
copy(out, s[0:n])
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
return string(doAppendInner(&rb, n))
}
// IsNormal returns true if b == f(b).
func (f Form) IsNormal(b []byte) bool {
src := inputBytes(b)
ft := formTable[f]
bp, ok := ft.quickSpan(src, 0, len(b), true)
if ok {
return true
}
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
rb.setFlusher(nil, cmpNormalBytes)
for bp < len(b) {
rb.out = b[bp:]
if bp = decomposeSegment(&rb, bp, true); bp < 0 {
return false
}
bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
}
return true
}
func cmpNormalBytes(rb *reorderBuffer) bool {
b := rb.out
for i := 0; i < rb.nrune; i++ {
info := rb.rune[i]
if int(info.size) > len(b) {
return false
}
p := info.pos
pe := p + info.size
for ; p < pe; p++ {
if b[0] != rb.byte[p] {
return false
}
b = b[1:]
}
}
return true
}
// IsNormalString returns true if s == f(s).
func (f Form) IsNormalString(s string) bool {
src := inputString(s)
ft := formTable[f]
bp, ok := ft.quickSpan(src, 0, len(s), true)
if ok {
return true
}
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
rb.setFlusher(nil, func(rb *reorderBuffer) bool {
for i := 0; i < rb.nrune; i++ {
info := rb.rune[i]
if bp+int(info.size) > len(s) {
return false
}
p := info.pos
pe := p + info.size
for ; p < pe; p++ {
if s[bp] != rb.byte[p] {
return false
}
bp++
}
}
return true
})
for bp < len(s) {
if bp = decomposeSegment(&rb, bp, true); bp < 0 {
return false
}
bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
}
return true
}
// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It returns the
// patched buffer and whether the decomposition is still in progress.
func patchTail(rb *reorderBuffer) bool {
info, p := lastRuneStart(&rb.f, rb.out)
if p == -1 || info.size == 0 {
return true
}
end := p + int(info.size)
extra := len(rb.out) - end
if extra > 0 {
// Potentially allocating memory. However, this only
// happens with ill-formed UTF-8.
x := make([]byte, 0)
x = append(x, rb.out[len(rb.out)-extra:]...)
rb.out = rb.out[:end]
decomposeToLastBoundary(rb)
rb.doFlush()
rb.out = append(rb.out, x...)
return false
}
buf := rb.out[p:]
rb.out = rb.out[:p]
decomposeToLastBoundary(rb)
if s := rb.ss.next(info); s == ssStarter {
rb.doFlush()
rb.ss.first(info)
} else if s == ssOverflow {
rb.doFlush()
rb.insertCGJ()
rb.ss = 0
}
rb.insertUnsafe(inputBytes(buf), 0, info)
return true
}
func appendQuick(rb *reorderBuffer, i int) int {
if rb.nsrc == i {
return i
}
end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
rb.out = rb.src.appendSlice(rb.out, i, end)
return end
}
// Append returns f(append(out, b...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
return f.doAppend(out, inputBytes(src), len(src))
}
func (f Form) doAppend(out []byte, src input, n int) []byte {
if n == 0 {
return out
}
ft := formTable[f]
// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
if len(out) == 0 {
p, _ := ft.quickSpan(src, 0, n, true)
out = src.appendSlice(out, 0, p)
if p == n {
return out
}
rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
return doAppendInner(&rb, p)
}
rb := reorderBuffer{f: *ft, src: src, nsrc: n}
return doAppend(&rb, out, 0)
}
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
rb.setFlusher(out, appendFlush)
src, n := rb.src, rb.nsrc
doMerge := len(out) > 0
if q := src.skipContinuationBytes(p); q > p {
// Move leading non-starters to destination.
rb.out = src.appendSlice(rb.out, p, q)
p = q
doMerge = patchTail(rb)
}
fd := &rb.f
if doMerge {
var info Properties
if p < n {
info = fd.info(src, p)
if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
if p == 0 {
decomposeToLastBoundary(rb)
}
p = decomposeSegment(rb, p, true)
}
}
if info.size == 0 {
rb.doFlush()
// Append incomplete UTF-8 encoding.
return src.appendSlice(rb.out, p, n)
}
if rb.nrune > 0 {
return doAppendInner(rb, p)
}
}
p = appendQuick(rb, p)
return doAppendInner(rb, p)
}
func doAppendInner(rb *reorderBuffer, p int) []byte {
for n := rb.nsrc; p < n; {
p = decomposeSegment(rb, p, true)
p = appendQuick(rb, p)
}
return rb.out
}
// AppendString returns f(append(out, []byte(s))).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) AppendString(out []byte, src string) []byte {
return f.doAppend(out, inputString(src), len(src))
}
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int {
n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
return n
}
// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
// whether any non-normalized parts were found. If atEOF is false, n will
// not point past the last segment if this segment might be become
// non-normalized by appending other runes.
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
var lastCC uint8
ss := streamSafe(0)
lastSegStart := i
for n = end; i < n; {
if j := src.skipASCII(i, n); i != j {
i = j
lastSegStart = i - 1
lastCC = 0
ss = 0
continue
}
info := f.info(src, i)
if info.size == 0 {
if atEOF {
// include incomplete runes
return n, true
}
return lastSegStart, true
}
// This block needs to be before the next, because it is possible to
// have an overflow for runes that are starters (e.g. with U+FF9E).
switch ss.next(info) {
case ssStarter:
ss.first(info)
lastSegStart = i
case ssOverflow:
return lastSegStart, false
case ssSuccess:
if lastCC > info.ccc {
return lastSegStart, false
}
}
if f.composing {
if !info.isYesC() {
break
}
} else {
if !info.isYesD() {
break
}
}
lastCC = info.ccc
i += int(info.size)
}
if i == n {
if !atEOF {
n = lastSegStart
}
return n, true
}
return lastSegStart, false
}
// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpanString(s string) int {
n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
return n
}
// FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int {
return f.firstBoundary(inputBytes(b), len(b))
}
func (f Form) firstBoundary(src input, nsrc int) int {
i := src.skipContinuationBytes(0)
if i >= nsrc {
return -1
}
fd := formTable[f]
ss := streamSafe(0)
// We should call ss.first here, but we can't as the first rune is
// skipped already. This means FirstBoundary can't really determine
// CGJ insertion points correctly. Luckily it doesn't have to.
// TODO: consider adding NextBoundary
for {
info := fd.info(src, i)
if info.size == 0 {
return -1
}
if s := ss.next(info); s != ssSuccess {
return i
}
i += int(info.size)
if i >= nsrc {
if !info.BoundaryAfter() && !ss.isMax() {
return -1
}
return nsrc
}
}
}
// FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) int {
return f.firstBoundary(inputString(s), len(s))
}
// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
return lastBoundary(formTable[f], b)
}
func lastBoundary(fd *formInfo, b []byte) int {
i := len(b)
info, p := lastRuneStart(fd, b)
if p == -1 {
return -1
}
if info.size == 0 { // ends with incomplete rune
if p == 0 { // starts with incomplete rune
return -1
}
i = p
info, p = lastRuneStart(fd, b[:i])
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
return i
}
}
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
return i
}
if info.BoundaryAfter() {
return i
}
ss := streamSafe(0)
v := ss.backwards(info)
for i = p; i >= 0 && v != ssStarter; i = p {
info, p = lastRuneStart(fd, b[:i])
if v = ss.backwards(info); v == ssOverflow {
break
}
if p+int(info.size) != i {
if p == -1 { // no boundary found
return -1
}
return i // boundary after an illegal UTF-8 encoding
}
}
return i
}
// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
// Force one character to be consumed.
info := rb.f.info(rb.src, sp)
if info.size == 0 {
return 0
}
if rb.nrune > 0 {
if s := rb.ss.next(info); s == ssStarter {
goto end
} else if s == ssOverflow {
rb.insertCGJ()
goto end
}
} else {
rb.ss.first(info)
}
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
return int(err)
}
for {
sp += int(info.size)
if sp >= rb.nsrc {
if !atEOF && !info.BoundaryAfter() {
return int(iShortSrc)
}
break
}
info = rb.f.info(rb.src, sp)
if info.size == 0 {
if !atEOF {
return int(iShortSrc)
}
break
}
if s := rb.ss.next(info); s == ssStarter {
break
} else if s == ssOverflow {
rb.insertCGJ()
break
}
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
return int(err)
}
}
end:
if !rb.doFlush() {
return int(iShortDst)
}
return sp
}
// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
p := len(buf) - 1
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
}
if p < 0 {
return Properties{}, -1
}
return fd.info(inputBytes(buf), p), p
}
// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. Returns the buffer minus the last segment.
func decomposeToLastBoundary(rb *reorderBuffer) {
fd := &rb.f
info, i := lastRuneStart(fd, rb.out)
if int(info.size) != len(rb.out)-i {
// illegal trailing continuation bytes
return
}
if info.BoundaryAfter() {
return
}
var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
padd := 0
ss := streamSafe(0)
p := len(rb.out)
for {
add[padd] = info
v := ss.backwards(info)
if v == ssOverflow {
// Note that if we have an overflow, it the string we are appending to
// is not correctly normalized. In this case the behavior is undefined.
break
}
padd++
p -= int(info.size)
if v == ssStarter || p < 0 {
break
}
info, i = lastRuneStart(fd, rb.out[:p])
if int(info.size) != p-i {
break
}
}
rb.ss = ss
// Copy bytes for insertion as we may need to overwrite rb.out.
var buf [maxBufferSize * utf8.UTFMax]byte
cp := buf[:copy(buf[:], rb.out[p:])]
rb.out = rb.out[:p]
for padd--; padd >= 0; padd-- {
info = add[padd]
rb.insertUnsafe(inputBytes(cp), 0, info)
cp = cp[info.size:]
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,318 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bufio"
"bytes"
"flag"
"fmt"
"log"
"net/http"
"os"
"path"
"regexp"
"runtime"
"strconv"
"strings"
"time"
"unicode"
"unicode/utf8"
"golang.org/x/text/unicode/norm"
)
func main() {
flag.Parse()
loadTestData()
CharacterByCharacterTests()
StandardTests()
PerformanceTest()
if errorCount == 0 {
fmt.Println("PASS")
}
}
const file = "NormalizationTest.txt"
var url = flag.String("url",
"http://www.unicode.org/Public/"+unicode.Version+"/ucd/"+file,
"URL of Unicode database directory")
var localFiles = flag.Bool("local",
false,
"data files have been copied to the current directory; for debugging only")
var logger = log.New(os.Stderr, "", log.Lshortfile)
// This regression test runs the test set in NormalizationTest.txt
// (taken from http://www.unicode.org/Public/<unicode.Version>/ucd/).
//
// NormalizationTest.txt has form:
// @Part0 # Specific cases
// #
// 1E0A;1E0A;0044 0307;1E0A;0044 0307; # (Ḋ; Ḋ; D◌̇; Ḋ; D◌̇; ) LATIN CAPITAL LETTER D WITH DOT ABOVE
// 1E0C;1E0C;0044 0323;1E0C;0044 0323; # (Ḍ; Ḍ; D◌̣; Ḍ; D◌̣; ) LATIN CAPITAL LETTER D WITH DOT BELOW
//
// Each test has 5 columns (c1, c2, c3, c4, c5), where
// (c1, c2, c3, c4, c5) == (c1, NFC(c1), NFD(c1), NFKC(c1), NFKD(c1))
//
// CONFORMANCE:
// 1. The following invariants must be true for all conformant implementations
//
// NFC
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
// c4 == NFC(c4) == NFC(c5)
//
// NFD
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
// c5 == NFD(c4) == NFD(c5)
//
// NFKC
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
//
// NFKD
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
//
// 2. For every code point X assigned in this version of Unicode that is not
// specifically listed in Part 1, the following invariants must be true
// for all conformant implementations:
//
// X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
//
// Column types.
const (
cRaw = iota
cNFC
cNFD
cNFKC
cNFKD
cMaxColumns
)
// Holds data from NormalizationTest.txt
var part []Part
type Part struct {
name string
number int
tests []Test
}
type Test struct {
name string
partnr int
number int
r rune // used for character by character test
cols [cMaxColumns]string // Each has 5 entries, see below.
}
func (t Test) Name() string {
if t.number < 0 {
return part[t.partnr].name
}
return fmt.Sprintf("%s:%d", part[t.partnr].name, t.number)
}
var partRe = regexp.MustCompile(`@Part(\d) # (.*)$`)
var testRe = regexp.MustCompile(`^` + strings.Repeat(`([\dA-F ]+);`, 5) + ` # (.*)$`)
var counter int
// Load the data form NormalizationTest.txt
func loadTestData() {
if *localFiles {
pwd, _ := os.Getwd()
*url = "file://" + path.Join(pwd, file)
}
t := &http.Transport{}
t.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))
c := &http.Client{Transport: t}
resp, err := c.Get(*url)
if err != nil {
logger.Fatal(err)
}
if resp.StatusCode != 200 {
logger.Fatal("bad GET status for "+file, resp.Status)
}
f := resp.Body
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
if len(line) == 0 || line[0] == '#' {
continue
}
m := partRe.FindStringSubmatch(line)
if m != nil {
if len(m) < 3 {
logger.Fatal("Failed to parse Part: ", line)
}
i, err := strconv.Atoi(m[1])
if err != nil {
logger.Fatal(err)
}
name := m[2]
part = append(part, Part{name: name[:len(name)-1], number: i})
continue
}
m = testRe.FindStringSubmatch(line)
if m == nil || len(m) < 7 {
logger.Fatalf(`Failed to parse: "%s" result: %#v`, line, m)
}
test := Test{name: m[6], partnr: len(part) - 1, number: counter}
counter++
for j := 1; j < len(m)-1; j++ {
for _, split := range strings.Split(m[j], " ") {
r, err := strconv.ParseUint(split, 16, 64)
if err != nil {
logger.Fatal(err)
}
if test.r == 0 {
// save for CharacterByCharacterTests
test.r = rune(r)
}
var buf [utf8.UTFMax]byte
sz := utf8.EncodeRune(buf[:], rune(r))
test.cols[j-1] += string(buf[:sz])
}
}
part := &part[len(part)-1]
part.tests = append(part.tests, test)
}
if scanner.Err() != nil {
logger.Fatal(scanner.Err())
}
}
var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
var errorCount int
func cmpResult(t *Test, name string, f norm.Form, gold, test, result string) {
if gold != result {
errorCount++
if errorCount > 20 {
return
}
logger.Printf("%s:%s: %s(%+q)=%+q; want %+q: %s",
t.Name(), name, fstr[f], test, result, gold, t.name)
}
}
func cmpIsNormal(t *Test, name string, f norm.Form, test string, result, want bool) {
if result != want {
errorCount++
if errorCount > 20 {
return
}
logger.Printf("%s:%s: %s(%+q)=%v; want %v", t.Name(), name, fstr[f], test, result, want)
}
}
func doTest(t *Test, f norm.Form, gold, test string) {
testb := []byte(test)
result := f.Bytes(testb)
cmpResult(t, "Bytes", f, gold, test, string(result))
sresult := f.String(test)
cmpResult(t, "String", f, gold, test, sresult)
acc := []byte{}
i := norm.Iter{}
i.InitString(f, test)
for !i.Done() {
acc = append(acc, i.Next()...)
}
cmpResult(t, "Iter.Next", f, gold, test, string(acc))
buf := make([]byte, 128)
acc = nil
for p := 0; p < len(testb); {
nDst, nSrc, _ := f.Transform(buf, testb[p:], true)
acc = append(acc, buf[:nDst]...)
p += nSrc
}
cmpResult(t, "Transform", f, gold, test, string(acc))
for i := range test {
out := f.Append(f.Bytes([]byte(test[:i])), []byte(test[i:])...)
cmpResult(t, fmt.Sprintf(":Append:%d", i), f, gold, test, string(out))
}
cmpIsNormal(t, "IsNormal", f, test, f.IsNormal([]byte(test)), test == gold)
cmpIsNormal(t, "IsNormalString", f, test, f.IsNormalString(test), test == gold)
}
func doConformanceTests(t *Test, partn int) {
for i := 0; i <= 2; i++ {
doTest(t, norm.NFC, t.cols[1], t.cols[i])
doTest(t, norm.NFD, t.cols[2], t.cols[i])
doTest(t, norm.NFKC, t.cols[3], t.cols[i])
doTest(t, norm.NFKD, t.cols[4], t.cols[i])
}
for i := 3; i <= 4; i++ {
doTest(t, norm.NFC, t.cols[3], t.cols[i])
doTest(t, norm.NFD, t.cols[4], t.cols[i])
doTest(t, norm.NFKC, t.cols[3], t.cols[i])
doTest(t, norm.NFKD, t.cols[4], t.cols[i])
}
}
func CharacterByCharacterTests() {
tests := part[1].tests
var last rune = 0
for i := 0; i <= len(tests); i++ { // last one is special case
var r rune
if i == len(tests) {
r = 0x2FA1E // Don't have to go to 0x10FFFF
} else {
r = tests[i].r
}
for last++; last < r; last++ {
// Check all characters that were not explicitly listed in the test.
t := &Test{partnr: 1, number: -1}
char := string(last)
doTest(t, norm.NFC, char, char)
doTest(t, norm.NFD, char, char)
doTest(t, norm.NFKC, char, char)
doTest(t, norm.NFKD, char, char)
}
if i < len(tests) {
doConformanceTests(&tests[i], 1)
}
}
}
func StandardTests() {
for _, j := range []int{0, 2, 3} {
for _, test := range part[j].tests {
doConformanceTests(&test, j)
}
}
}
// PerformanceTest verifies that normalization is O(n). If any of the
// code does not properly check for maxCombiningChars, normalization
// may exhibit O(n**2) behavior.
func PerformanceTest() {
runtime.GOMAXPROCS(2)
success := make(chan bool, 1)
go func() {
buf := bytes.Repeat([]byte("\u035D"), 1024*1024)
buf = append(buf, "\u035B"...)
norm.NFC.Append(nil, buf...)
success <- true
}()
timeout := time.After(1 * time.Second)
select {
case <-success:
// test completed before the timeout
case <-timeout:
errorCount++
logger.Printf(`unexpectedly long time to complete PerformanceTest`)
}
}

View File

@@ -0,0 +1,126 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import "io"
type normWriter struct {
rb reorderBuffer
w io.Writer
buf []byte
}
// Write implements the standard write interface. If the last characters are
// not at a normalization boundary, the bytes will be buffered for the next
// write. The remaining bytes will be written on close.
func (w *normWriter) Write(data []byte) (n int, err error) {
// Process data in pieces to keep w.buf size bounded.
const chunk = 4000
for len(data) > 0 {
// Normalize into w.buf.
m := len(data)
if m > chunk {
m = chunk
}
w.rb.src = inputBytes(data[:m])
w.rb.nsrc = m
w.buf = doAppend(&w.rb, w.buf, 0)
data = data[m:]
n += m
// Write out complete prefix, save remainder.
// Note that lastBoundary looks back at most 31 runes.
i := lastBoundary(&w.rb.f, w.buf)
if i == -1 {
i = 0
}
if i > 0 {
if _, err = w.w.Write(w.buf[:i]); err != nil {
break
}
bn := copy(w.buf, w.buf[i:])
w.buf = w.buf[:bn]
}
}
return n, err
}
// Close forces data that remains in the buffer to be written.
func (w *normWriter) Close() error {
if len(w.buf) > 0 {
_, err := w.w.Write(w.buf)
if err != nil {
return err
}
}
return nil
}
// Writer returns a new writer that implements Write(b)
// by writing f(b) to w. The returned writer may use an
// an internal buffer to maintain state across Write calls.
// Calling its Close method writes any buffered data to w.
func (f Form) Writer(w io.Writer) io.WriteCloser {
wr := &normWriter{rb: reorderBuffer{}, w: w}
wr.rb.init(f, nil)
return wr
}
type normReader struct {
rb reorderBuffer
r io.Reader
inbuf []byte
outbuf []byte
bufStart int
lastBoundary int
err error
}
// Read implements the standard read interface.
func (r *normReader) Read(p []byte) (int, error) {
for {
if r.lastBoundary-r.bufStart > 0 {
n := copy(p, r.outbuf[r.bufStart:r.lastBoundary])
r.bufStart += n
if r.lastBoundary-r.bufStart > 0 {
return n, nil
}
return n, r.err
}
if r.err != nil {
return 0, r.err
}
outn := copy(r.outbuf, r.outbuf[r.lastBoundary:])
r.outbuf = r.outbuf[0:outn]
r.bufStart = 0
n, err := r.r.Read(r.inbuf)
r.rb.src = inputBytes(r.inbuf[0:n])
r.rb.nsrc, r.err = n, err
if n > 0 {
r.outbuf = doAppend(&r.rb, r.outbuf, 0)
}
if err == io.EOF {
r.lastBoundary = len(r.outbuf)
} else {
r.lastBoundary = lastBoundary(&r.rb.f, r.outbuf)
if r.lastBoundary == -1 {
r.lastBoundary = 0
}
}
}
panic("should not reach here")
}
// Reader returns a new reader that implements Read
// by reading data from r and returning f(data).
func (f Form) Reader(r io.Reader) io.Reader {
const chunk = 4000
buf := make([]byte, chunk)
rr := &normReader{rb: reorderBuffer{}, r: r, inbuf: buf}
rr.rb.init(f, buf)
return rr
}

View File

@@ -0,0 +1,56 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import (
"bytes"
"fmt"
"testing"
)
var bufSizes = []int{1, 2, 3, 4, 5, 6, 7, 8, 100, 101, 102, 103, 4000, 4001, 4002, 4003}
func readFunc(size int) appendFunc {
return func(f Form, out []byte, s string) []byte {
out = append(out, s...)
r := f.Reader(bytes.NewBuffer(out))
buf := make([]byte, size)
result := []byte{}
for n, err := 0, error(nil); err == nil; {
n, err = r.Read(buf)
result = append(result, buf[:n]...)
}
return result
}
}
func TestReader(t *testing.T) {
for _, s := range bufSizes {
name := fmt.Sprintf("TestReader%d", s)
runNormTests(t, name, readFunc(s))
}
}
func writeFunc(size int) appendFunc {
return func(f Form, out []byte, s string) []byte {
in := append(out, s...)
result := new(bytes.Buffer)
w := f.Writer(result)
buf := make([]byte, size)
for n := 0; len(in) > 0; in = in[n:] {
n = copy(buf, in)
_, _ = w.Write(buf[:n])
}
w.Close()
return result.Bytes()
}
}
func TestWriter(t *testing.T) {
for _, s := range bufSizes {
name := fmt.Sprintf("TestWriter%d", s)
runNormTests(t, name, writeFunc(s))
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,88 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import (
"unicode/utf8"
"golang.org/x/text/transform"
)
// Reset implements the Reset method of the transform.Transformer interface.
func (Form) Reset() {}
// Transform implements the Transform method of the transform.Transformer
// interface. It may need to write segments of up to MaxSegmentSize at once.
// Users should either catch ErrShortDst and allow dst to grow or have dst be at
// least of size MaxTransformChunkSize to be guaranteed of progress.
func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
n := 0
// Cap the maximum number of src bytes to check.
b := src
eof := atEOF
if ns := len(dst); ns < len(b) {
err = transform.ErrShortDst
eof = false
b = b[:ns]
}
i, ok := formTable[f].quickSpan(inputBytes(b), n, len(b), eof)
n += copy(dst[n:], b[n:i])
if !ok {
nDst, nSrc, err = f.transform(dst[n:], src[n:], atEOF)
return nDst + n, nSrc + n, err
}
if n < len(src) && !atEOF {
err = transform.ErrShortSrc
}
return n, n, err
}
func flushTransform(rb *reorderBuffer) bool {
// Write out (must fully fit in dst, or else it is a ErrShortDst).
if len(rb.out) < rb.nrune*utf8.UTFMax {
return false
}
rb.out = rb.out[rb.flushCopy(rb.out):]
return true
}
var errs = []error{nil, transform.ErrShortDst, transform.ErrShortSrc}
// transform implements the transform.Transformer interface. It is only called
// when quickSpan does not pass for a given string.
func (f Form) transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
// TODO: get rid of reorderBuffer. See CL 23460044.
rb := reorderBuffer{}
rb.init(f, src)
for {
// Load segment into reorder buffer.
rb.setFlusher(dst[nDst:], flushTransform)
end := decomposeSegment(&rb, nSrc, atEOF)
if end < 0 {
return nDst, nSrc, errs[-end]
}
nDst = len(dst) - len(rb.out)
nSrc = end
// Next quickSpan.
end = rb.nsrc
eof := atEOF
if n := nSrc + len(dst) - nDst; n < end {
err = transform.ErrShortDst
end = n
eof = false
}
end, ok := rb.f.quickSpan(rb.src, nSrc, end, eof)
n := copy(dst[nDst:], rb.src.bytes[nSrc:end])
nSrc += n
nDst += n
if ok {
if n < rb.nsrc && !atEOF {
err = transform.ErrShortSrc
}
return nDst, nSrc, err
}
}
}

View File

@@ -0,0 +1,101 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import (
"fmt"
"testing"
"golang.org/x/text/transform"
)
func TestTransform(t *testing.T) {
tests := []struct {
f Form
in, out string
eof bool
dstSize int
err error
}{
{NFC, "ab", "ab", true, 2, nil},
{NFC, "qx", "qx", true, 2, nil},
{NFD, "qx", "qx", true, 2, nil},
{NFC, "", "", true, 1, nil},
{NFD, "", "", true, 1, nil},
{NFC, "", "", false, 1, nil},
{NFD, "", "", false, 1, nil},
// Normalized segment does not fit in destination.
{NFD, "ö", "", true, 1, transform.ErrShortDst},
{NFD, "ö", "", true, 2, transform.ErrShortDst},
// As an artifact of the algorithm, only full segments are written.
// This is not strictly required, and some bytes could be written.
// In practice, for Transform to not block, the destination buffer
// should be at least MaxSegmentSize to work anyway and these edge
// conditions will be relatively rare.
{NFC, "ab", "", true, 1, transform.ErrShortDst},
// This is even true for inert runes.
{NFC, "qx", "", true, 1, transform.ErrShortDst},
{NFC, "a\u0300abc", "\u00e0a", true, 4, transform.ErrShortDst},
// We cannot write a segment if succesive runes could still change the result.
{NFD, "ö", "", false, 3, transform.ErrShortSrc},
{NFC, "a\u0300", "", false, 4, transform.ErrShortSrc},
{NFD, "a\u0300", "", false, 4, transform.ErrShortSrc},
{NFC, "ö", "", false, 3, transform.ErrShortSrc},
{NFC, "a\u0300", "", true, 1, transform.ErrShortDst},
// Theoretically could fit, but won't due to simplified checks.
{NFC, "a\u0300", "", true, 2, transform.ErrShortDst},
{NFC, "a\u0300", "", true, 3, transform.ErrShortDst},
{NFC, "a\u0300", "\u00e0", true, 4, nil},
{NFD, "öa\u0300", "o\u0308", false, 8, transform.ErrShortSrc},
{NFD, "öa\u0300ö", "o\u0308a\u0300", true, 8, transform.ErrShortDst},
{NFD, "öa\u0300ö", "o\u0308a\u0300", false, 12, transform.ErrShortSrc},
// Illegal input is copied verbatim.
{NFD, "\xbd\xb2=\xbc ", "\xbd\xb2=\xbc ", true, 8, nil},
}
b := make([]byte, 100)
for i, tt := range tests {
nDst, _, err := tt.f.Transform(b[:tt.dstSize], []byte(tt.in), tt.eof)
out := string(b[:nDst])
if out != tt.out || err != tt.err {
t.Errorf("%d: was %+q (%v); want %+q (%v)", i, out, err, tt.out, tt.err)
}
if want := tt.f.String(tt.in)[:nDst]; want != out {
t.Errorf("%d: incorect normalization: was %+q; want %+q", i, out, want)
}
}
}
var transBufSizes = []int{
MaxTransformChunkSize,
3 * MaxTransformChunkSize / 2,
2 * MaxTransformChunkSize,
3 * MaxTransformChunkSize,
100 * MaxTransformChunkSize,
}
func doTransNorm(f Form, buf []byte, b []byte) []byte {
acc := []byte{}
for p := 0; p < len(b); {
nd, ns, _ := f.Transform(buf[:], b[p:], true)
p += ns
acc = append(acc, buf[:nd]...)
}
return acc
}
func TestTransformNorm(t *testing.T) {
for _, sz := range transBufSizes {
buf := make([]byte, sz)
runNormTests(t, fmt.Sprintf("Transform:%d", sz), func(f Form, out []byte, s string) []byte {
return doTransNorm(f, buf, append(out, s...))
})
}
}

View File

@@ -0,0 +1,54 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
type valueRange struct {
value uint16 // header: value:stride
lo, hi byte // header: lo:n
}
type sparseBlocks struct {
values []valueRange
offset []uint16
}
var nfcSparse = sparseBlocks{
values: nfcSparseValues[:],
offset: nfcSparseOffset[:],
}
var nfkcSparse = sparseBlocks{
values: nfkcSparseValues[:],
offset: nfkcSparseOffset[:],
}
var (
nfcData = newNfcTrie(0)
nfkcData = newNfkcTrie(0)
)
// lookupValue determines the type of block n and looks up the value for b.
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
// is a list of ranges with an accompanying value. Given a matching range r,
// the value for b is by r.value + (b - r.lo) * stride.
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 {
offset := t.offset[n]
header := t.values[offset]
lo := offset + 1
hi := lo + uint16(header.lo)
for lo < hi {
m := lo + (hi-lo)/2
r := t.values[m]
if r.lo <= b && b <= r.hi {
return r.value + uint16(b-r.lo)*header.value
}
if b < r.lo {
hi = m
} else {
lo = m + 1
}
}
return 0
}

View File

@@ -0,0 +1,117 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Trie table generator.
// Used by make*tables tools to generate a go file with trie data structures
// for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte
// sequence are used to lookup offsets in the index table to be used for the
// next byte. The last byte is used to index into a table with 16-bit values.
package main
import (
"fmt"
"io"
)
const maxSparseEntries = 16
type normCompacter struct {
sparseBlocks [][]uint64
sparseOffset []uint16
sparseCount int
name string
}
func mostFrequentStride(a []uint64) int {
counts := make(map[int]int)
var v int
for _, x := range a {
if stride := int(x) - v; v != 0 && stride >= 0 {
counts[stride]++
}
v = int(x)
}
var maxs, maxc int
for stride, cnt := range counts {
if cnt > maxc || (cnt == maxc && stride < maxs) {
maxs, maxc = stride, cnt
}
}
return maxs
}
func countSparseEntries(a []uint64) int {
stride := mostFrequentStride(a)
var v, count int
for _, tv := range a {
if int(tv)-v != stride {
if tv != 0 {
count++
}
}
v = int(tv)
}
return count
}
func (c *normCompacter) Size(v []uint64) (sz int, ok bool) {
if n := countSparseEntries(v); n <= maxSparseEntries {
return (n+1)*4 + 2, true
}
return 0, false
}
func (c *normCompacter) Store(v []uint64) uint32 {
h := uint32(len(c.sparseOffset))
c.sparseBlocks = append(c.sparseBlocks, v)
c.sparseOffset = append(c.sparseOffset, uint16(c.sparseCount))
c.sparseCount += countSparseEntries(v) + 1
return h
}
func (c *normCompacter) Handler() string {
return c.name + "Sparse.lookup"
}
func (c *normCompacter) Print(w io.Writer) (retErr error) {
p := func(f string, x ...interface{}) {
if _, err := fmt.Fprintf(w, f, x...); retErr == nil && err != nil {
retErr = err
}
}
ls := len(c.sparseBlocks)
p("// %sSparseOffset: %d entries, %d bytes\n", c.name, ls, ls*2)
p("var %sSparseOffset = %#v\n\n", c.name, c.sparseOffset)
ns := c.sparseCount
p("// %sSparseValues: %d entries, %d bytes\n", c.name, ns, ns*4)
p("var %sSparseValues = [%d]valueRange {", c.name, ns)
for i, b := range c.sparseBlocks {
p("\n// Block %#x, offset %#x", i, c.sparseOffset[i])
var v int
stride := mostFrequentStride(b)
n := countSparseEntries(b)
p("\n{value:%#04x,lo:%#02x},", stride, uint8(n))
for i, nv := range b {
if int(nv)-v != stride {
if v != 0 {
p(",hi:%#02x},", 0x80+i-1)
}
if nv != 0 {
p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
}
}
v = int(nv)
}
if v != 0 {
p(",hi:%#02x},", 0x80+len(b)-1)
}
}
p("\n}\n\n")
return
}