Skip to content

Double-byte characters in DBCLOB columns are mangled #274

@willdonnelly

Description

@willdonnelly

Text contents of DBCLOB columns appears to be mangled. This repro example:

// Minimal reproduction of DBCLOB encoding bug in go_ibm_db driver.
//
// This program demonstrates that the go_ibm_db driver mangles text when
// reading DBCLOB vs VARGRAPHIC columns, even though both store identical
// data in DB2.
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/ibmdb/go_ibm_db"
)

func main() {
	const connStr = "HOSTNAME=127.0.0.1;PORT=50000;DATABASE=testdb;UID=db2inst1;PWD=secret1234"
	db, err := sql.Open("go_ibm_db", connStr)
	if err != nil {
		log.Fatalf("failed to connect: %v", err)
	}
	defer db.Close()

	db.Exec("DROP TABLE dbclob_test")
	_, err = db.Exec(`CREATE TABLE dbclob_test (
		id INTEGER PRIMARY KEY NOT NULL,
		vargraphic_col VARGRAPHIC(100),
		dbclob_col DBCLOB
	)`)
	if err != nil {
		log.Fatalf("failed to create table: %v", err)
	}
	defer db.Exec("DROP TABLE dbclob_test")

	const testval = "Hello世界"
	_, err = db.Exec(fmt.Sprintf("INSERT INTO dbclob_test VALUES (1, G'%s', G'%s')", testval, testval))
	if err != nil {
		log.Fatalf("failed to insert: %v", err)
	}

	var vargraphic, dbclob, dbclobAsVarchar string
	err = db.QueryRow("SELECT vargraphic_col, dbclob_col, CAST(dbclob_col AS VARCHAR(200)) FROM dbclob_test").Scan(&vargraphic, &dbclob, &dbclobAsVarchar)
	if err != nil {
		log.Fatalf("failed to select: %v", err)
	}

	fmt.Printf("Input: %q\n", testval)
	fmt.Printf("%-20s %-34s %s\n", "", "Hex", "String")
	fmt.Printf("%-20s %-34x %q\n", "VARGRAPHIC", vargraphic, vargraphic)
	fmt.Printf("%-20s %-34x %q\n", "DBCLOB", dbclob, dbclob)
	fmt.Printf("%-20s %-34x %q\n", "DBCLOB as VARCHAR", dbclobAsVarchar, dbclobAsVarchar)
}

Produces the following output:

Input: "Hello世界"
                     Hex                                String
VARGRAPHIC           48656c6c6fe4b896e7958c             "Hello世界"
DBCLOB               48656c6c6f4e16754c0000000000       "HelloN\x16uL\x00\x00\x00\x00\x00"
DBCLOB as VARCHAR    48656c6c6fe4b896e7958c             "Hello世界"

What appears to be happening here is that the data arriving from the DB is a sequence of UTF-16BE codepoints, and the transform applied inside the client library just "strips" any null bytes (actually just moving them to the end of the string, due to the fixed-length string allocation) and then interprets the result as UTF-8, which is not correct handling for multi-byte characters. Something like the following appears to correct this behavior, however I have not tested it exhaustively:

diff --git a/column.go b/column.go
index 0f7cc2d..5e621b3 100644
--- a/column.go
+++ b/column.go
@@ -201,7 +201,7 @@ func (c *BaseColumn) Value(buf []byte) (driver.Value, error) {
                        return nil, nil
                }
                s := (*[1 << 20]uint8)(p)[:len(buf)]
-               return removeNulls(s), nil
+               return dbclobToUTF8(s), nil
        case api.SQL_C_TYPE_TIMESTAMP:
                t := (*api.SQL_TIMESTAMP_STRUCT)(p)
                r := time.Date(int(t.Year), time.Month(t.Month), int(t.Day),
diff --git a/sqlOut.go b/sqlOut.go
index 066f719..3af3b61 100644
--- a/sqlOut.go
+++ b/sqlOut.go
@@ -190,7 +190,7 @@ func (o *Out) Value() (driver.Value, error) {
                        return nil, nil
                }
                s := (*[1 << 20]uint8)(p)[:len(buf)]
-               return removeNulls(s), nil
+               return dbclobToUTF8(s), nil
        case api.SQL_C_TYPE_TIMESTAMP:
                t := (*api.SQL_TIMESTAMP_STRUCT)(p)
                r := time.Date(int(t.Year), time.Month(t.Month), int(t.Day),
diff --git a/utf16.go b/utf16.go
index 54444f3..2cea199 100644
--- a/utf16.go
+++ b/utf16.go
@@ -54,16 +54,21 @@ func utf16toutf8(s []uint16) []byte {
        return buf
 }

-//This func takes []uint8 array and then removes the null
-//and then returns []uint8
-func removeNulls(s []uint8) []uint8 {
-       buf := make([]uint8, len(s))
-       ind := 0
-       for _, v := range s {
-               if v != 0 {
-                       buf[ind] = v
-                       ind++
-               }
-       }
-       return buf
+// dbclobToUTF8 takes a slice of bytes representing the contents of a DBCLOB
+// column and converts them to UTF-8 by first interpreting them as UTF-16BE
+// data and then calling utf16toutf8().
+//
+// This could be done more efficiently by casting/swapping bytes in place.
+func dbclobToUTF8(bs []uint8) []byte {
+       if len(bs) % 2 != 0 {
+               // Since we should never have an odd number of bytes,
+               // truncating the odd byte is as good as anything else.
+               bs = bs[:len(bs)-1]
+       }
+
+       buf := make([]uint16, len(bs)/2)
+       for i := 0; i < len(buf); i++ {
+               buf[i] = uint16(bs[2*i])<<8 | uint16(bs[2*i+1])
+       }
+       return utf16toutf8(buf)
 }
Environment information:
$ go env
GO111MODULE=''
GOARCH='amd64'
GOBIN=''
GOCACHE='/home/will/.cache/go-build'
GOENV='/home/will/.config/go/env'
GOEXE=''
GOEXPERIMENT=''
GOFLAGS=''
GOHOSTARCH='amd64'
GOHOSTOS='linux'
GOINSECURE=''
GOMODCACHE='/home/will/go/pkg/mod'
GONOPROXY=''
GONOSUMDB=''
GOOS='linux'
GOPATH='/home/will/go'
GOPRIVATE=''
GOPROXY='https://proxy.golang.org,direct'
GOROOT='/usr/local/go'
GOSUMDB='sum.golang.org'
GOTMPDIR=''
GOTOOLCHAIN='auto'
GOTOOLDIR='/usr/local/go/pkg/tool/linux_amd64'
GOVCS=''
GOVERSION='go1.23.3'
GODEBUG=''
GOTELEMETRY='local'
GOTELEMETRYDIR='/home/will/.config/go/telemetry'
GCCGO='gccgo'
GOAMD64='v1'
AR='ar'
CC='gcc'
CXX='g++'
CGO_ENABLED='1'
GOMOD='/home/will/third-party/go_ibm_db/go.mod'
GOWORK=''
CGO_CFLAGS='-I/home/will/go/pkg/mod/github.com/ibmdb/clidriver/include'
CGO_CPPFLAGS=''
CGO_CXXFLAGS='-O2 -g'
CGO_FFLAGS='-O2 -g'
CGO_LDFLAGS='-L/home/will/go/pkg/mod/github.com/ibmdb/clidriver/lib'
PKG_CONFIG='pkg-config'
GOGCCFLAGS='-fPIC -m64 -pthread -Wl,--no-gc-sections -fmessage-length=0 -ffile-prefix-map=/tmp/go-build3187854290=/tmp/go-build -gno-record-gcc-switches'
$ grep ibmdb go.*
go.mod:	github.com/ibmdb/go_ibm_db v0.5.4 // indirect
go.sum:github.com/ibmdb/go_ibm_db v0.5.4 h1:cveEOt1J2PoQivQdxIQB0f8ugDJYKaSmh7RUKAaJyAE=
go.sum:github.com/ibmdb/go_ibm_db v0.5.4/go.mod h1:BA12Alfe+h5BMGZGE+b0pqP4leILZkpoxe5qr/iMoHw=

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions