-
Notifications
You must be signed in to change notification settings - Fork 41
Open
Description
Text contents of DBCLOB columns appears to be mangled. This repro example:
// Minimal reproduction of DBCLOB encoding bug in go_ibm_db driver.
//
// This program demonstrates that the go_ibm_db driver mangles text when
// reading DBCLOB vs VARGRAPHIC columns, even though both store identical
// data in DB2.
package main
import (
"database/sql"
"fmt"
"log"
_ "github.com/ibmdb/go_ibm_db"
)
func main() {
const connStr = "HOSTNAME=127.0.0.1;PORT=50000;DATABASE=testdb;UID=db2inst1;PWD=secret1234"
db, err := sql.Open("go_ibm_db", connStr)
if err != nil {
log.Fatalf("failed to connect: %v", err)
}
defer db.Close()
db.Exec("DROP TABLE dbclob_test")
_, err = db.Exec(`CREATE TABLE dbclob_test (
id INTEGER PRIMARY KEY NOT NULL,
vargraphic_col VARGRAPHIC(100),
dbclob_col DBCLOB
)`)
if err != nil {
log.Fatalf("failed to create table: %v", err)
}
defer db.Exec("DROP TABLE dbclob_test")
const testval = "Hello世界"
_, err = db.Exec(fmt.Sprintf("INSERT INTO dbclob_test VALUES (1, G'%s', G'%s')", testval, testval))
if err != nil {
log.Fatalf("failed to insert: %v", err)
}
var vargraphic, dbclob, dbclobAsVarchar string
err = db.QueryRow("SELECT vargraphic_col, dbclob_col, CAST(dbclob_col AS VARCHAR(200)) FROM dbclob_test").Scan(&vargraphic, &dbclob, &dbclobAsVarchar)
if err != nil {
log.Fatalf("failed to select: %v", err)
}
fmt.Printf("Input: %q\n", testval)
fmt.Printf("%-20s %-34s %s\n", "", "Hex", "String")
fmt.Printf("%-20s %-34x %q\n", "VARGRAPHIC", vargraphic, vargraphic)
fmt.Printf("%-20s %-34x %q\n", "DBCLOB", dbclob, dbclob)
fmt.Printf("%-20s %-34x %q\n", "DBCLOB as VARCHAR", dbclobAsVarchar, dbclobAsVarchar)
}Produces the following output:
Input: "Hello世界"
Hex String
VARGRAPHIC 48656c6c6fe4b896e7958c "Hello世界"
DBCLOB 48656c6c6f4e16754c0000000000 "HelloN\x16uL\x00\x00\x00\x00\x00"
DBCLOB as VARCHAR 48656c6c6fe4b896e7958c "Hello世界"
What appears to be happening here is that the data arriving from the DB is a sequence of UTF-16BE codepoints, and the transform applied inside the client library just "strips" any null bytes (actually just moving them to the end of the string, due to the fixed-length string allocation) and then interprets the result as UTF-8, which is not correct handling for multi-byte characters. Something like the following appears to correct this behavior, however I have not tested it exhaustively:
diff --git a/column.go b/column.go
index 0f7cc2d..5e621b3 100644
--- a/column.go
+++ b/column.go
@@ -201,7 +201,7 @@ func (c *BaseColumn) Value(buf []byte) (driver.Value, error) {
return nil, nil
}
s := (*[1 << 20]uint8)(p)[:len(buf)]
- return removeNulls(s), nil
+ return dbclobToUTF8(s), nil
case api.SQL_C_TYPE_TIMESTAMP:
t := (*api.SQL_TIMESTAMP_STRUCT)(p)
r := time.Date(int(t.Year), time.Month(t.Month), int(t.Day),
diff --git a/sqlOut.go b/sqlOut.go
index 066f719..3af3b61 100644
--- a/sqlOut.go
+++ b/sqlOut.go
@@ -190,7 +190,7 @@ func (o *Out) Value() (driver.Value, error) {
return nil, nil
}
s := (*[1 << 20]uint8)(p)[:len(buf)]
- return removeNulls(s), nil
+ return dbclobToUTF8(s), nil
case api.SQL_C_TYPE_TIMESTAMP:
t := (*api.SQL_TIMESTAMP_STRUCT)(p)
r := time.Date(int(t.Year), time.Month(t.Month), int(t.Day),
diff --git a/utf16.go b/utf16.go
index 54444f3..2cea199 100644
--- a/utf16.go
+++ b/utf16.go
@@ -54,16 +54,21 @@ func utf16toutf8(s []uint16) []byte {
return buf
}
-//This func takes []uint8 array and then removes the null
-//and then returns []uint8
-func removeNulls(s []uint8) []uint8 {
- buf := make([]uint8, len(s))
- ind := 0
- for _, v := range s {
- if v != 0 {
- buf[ind] = v
- ind++
- }
- }
- return buf
+// dbclobToUTF8 takes a slice of bytes representing the contents of a DBCLOB
+// column and converts them to UTF-8 by first interpreting them as UTF-16BE
+// data and then calling utf16toutf8().
+//
+// This could be done more efficiently by casting/swapping bytes in place.
+func dbclobToUTF8(bs []uint8) []byte {
+ if len(bs) % 2 != 0 {
+ // Since we should never have an odd number of bytes,
+ // truncating the odd byte is as good as anything else.
+ bs = bs[:len(bs)-1]
+ }
+
+ buf := make([]uint16, len(bs)/2)
+ for i := 0; i < len(buf); i++ {
+ buf[i] = uint16(bs[2*i])<<8 | uint16(bs[2*i+1])
+ }
+ return utf16toutf8(buf)
}Environment information:
$ go env
GO111MODULE=''
GOARCH='amd64'
GOBIN=''
GOCACHE='/home/will/.cache/go-build'
GOENV='/home/will/.config/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFLAGS=''
GOHOSTARCH='amd64'
GOHOSTOS='linux'
GOINSECURE=''
GOMODCACHE='/home/will/go/pkg/mod'
GONOPROXY=''
GONOSUMDB=''
GOOS='linux'
GOPATH='/home/will/go'
GOPRIVATE=''
GOPROXY='https://proxy.golang.org,direct'
GOROOT='/usr/local/go'
GOSUMDB='sum.golang.org'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/local/go/pkg/tool/linux_amd64'
GOVCS=''
GOVERSION='go1.23.3'
GODEBUG=''
GOTELEMETRY='local'
GOTELEMETRYDIR='/home/will/.config/go/telemetry'
GCCGO='gccgo'
GOAMD64='v1'
AR='ar'
CC='gcc'
CXX='g++'
CGO_ENABLED='1'
GOMOD='/home/will/third-party/go_ibm_db/go.mod'
GOWORK=''
CGO_CFLAGS='-I/home/will/go/pkg/mod/github.com/ibmdb/clidriver/include'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-L/home/will/go/pkg/mod/github.com/ibmdb/clidriver/lib'
PKG_CONFIG='pkg-config'
GOGCCFLAGS='-fPIC -m64 -pthread -Wl,--no-gc-sections -fmessage-length=0 -ffile-prefix-map=/tmp/go-build3187854290=/tmp/go-build -gno-record-gcc-switches'
$ grep ibmdb go.*
go.mod: github.com/ibmdb/go_ibm_db v0.5.4 // indirect
go.sum:github.com/ibmdb/go_ibm_db v0.5.4 h1:cveEOt1J2PoQivQdxIQB0f8ugDJYKaSmh7RUKAaJyAE=
go.sum:github.com/ibmdb/go_ibm_db v0.5.4/go.mod h1:BA12Alfe+h5BMGZGE+b0pqP4leILZkpoxe5qr/iMoHw=Metadata
Metadata
Assignees
Labels
No labels