> > I've recently run into issues when using JSON.parse on ODB keys containing
> > 8-bit data.
>
> I am tempted to take a hard line and say that in general MIDAS TID_STRING data should be valid
> UTF-8 encoded Unicode. In the modern mixed javascript/json/whatever environment I think
> it is impractical to handle or permit invalid UTF-8 strings.
>
> Certainly in the general case, replacing all control characters with something else or escaping them or
> otherwise changing the value if TID_STRING data would wreck *valid* UTF-8 strings, which I would
> assume to be the normal use.
>
> In other words, non-UTF-8 strings are following non-IEEE-754 floating point values into oblivion - as
> we do not check the TID_FLOAT and TID_DOUBLE is valid IEEE-754 values, we should not check
> that TID_STRING is valid UTF-8.
I agree that I think we should start requiring strings to be UTF-8 encoded unicode.
I'd suggest that before worrying about the TID_STRING data, we should start by sanitizing the ODB key names.
I've seen a couple cases where the ODB key name is a non-UTF-8 string. It is very awkward to use odbedit
to delete these keys.
I attach a suggested modification to odb.c that rejects calls to db_create_key with non-UTF-8 key names. It
uses some random function I found on the internet that is supposed to check if a string is valid UTF-8. I
checked a couple of strings with invalid UTF-8 characters and it correctly identified them. But I won't
claim to be certain that this is really identifying all UTF-8 vs non-UTF-8 cases. Maybe others have a
better way of identifying this. |
diff --git a/src/odb.c b/src/odb.c
index 47ace8f..041080e 100755
--- a/src/odb.c
+++ b/src/odb.c
@@ -1818,6 +1818,90 @@ BOOL equal_ustring(const char *str1, const char *str2)
return TRUE;
}
+// Method to check if a given string is valid UTF-8. Returns 1 if it is.
+BOOL is_utf8(const char * string)
+{
+ if(!string)
+ return 0;
+
+ const unsigned char * bytes = (const unsigned char *)string;
+ while(*bytes)
+ {
+ if( (// ASCII
+ // use bytes[0] <= 0x7F to allow ASCII control characters
+ bytes[0] == 0x09 ||
+ bytes[0] == 0x0A ||
+ bytes[0] == 0x0D ||
+ (0x20 <= bytes[0] && bytes[0] <= 0x7E)
+ )
+ ) {
+ bytes += 1;
+ continue;
+ }
+
+ if( (// non-overlong 2-byte
+ (0xC2 <= bytes[0] && bytes[0] <= 0xDF) &&
+ (0x80 <= bytes[1] && bytes[1] <= 0xBF)
+ )
+ ) {
+ bytes += 2;
+ continue;
+ }
+
+ if( (// excluding overlongs
+ bytes[0] == 0xE0 &&
+ (0xA0 <= bytes[1] && bytes[1] <= 0xBF) &&
+ (0x80 <= bytes[2] && bytes[2] <= 0xBF)
+ ) ||
+ (// straight 3-byte
+ ((0xE1 <= bytes[0] && bytes[0] <= 0xEC) ||
+ bytes[0] == 0xEE ||
+ bytes[0] == 0xEF) &&
+ (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
+ (0x80 <= bytes[2] && bytes[2] <= 0xBF)
+ ) ||
+ (// excluding surrogates
+ bytes[0] == 0xED &&
+ (0x80 <= bytes[1] && bytes[1] <= 0x9F) &&
+ (0x80 <= bytes[2] && bytes[2] <= 0xBF)
+ )
+ ) {
+ bytes += 3;
+ continue;
+ }
+
+ if( (// planes 1-3
+ bytes[0] == 0xF0 &&
+ (0x90 <= bytes[1] && bytes[1] <= 0xBF) &&
+ (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
+ (0x80 <= bytes[3] && bytes[3] <= 0xBF)
+ ) ||
+ (// planes 4-15
+ (0xF1 <= bytes[0] && bytes[0] <= 0xF3) &&
+ (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
+ (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
+ (0x80 <= bytes[3] && bytes[3] <= 0xBF)
+ ) ||
+ (// plane 16
+ bytes[0] == 0xF4 &&
+ (0x80 <= bytes[1] && bytes[1] <= 0x8F) &&
+ (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
+ (0x80 <= bytes[3] && bytes[3] <= 0xBF)
+ )
+ ) {
+ bytes += 4;
+ continue;
+ }
+
+ return 0;
+ }
+
+ return 1;
+}
+
+
+
+
/********************************************************************/
/**
Create a new key in a database
@@ -1829,6 +1913,12 @@ Create a new key in a database
*/
INT db_create_key(HNDLE hDB, HNDLE hKey, const char *key_name, DWORD type)
{
+
+ if(!is_utf8(key_name)){
+ cm_msg(MERROR, "db_create_key", "invalid non-UTF-8 key name \'%s\'", key_name);
+ return DB_INVALID_PARAM;
+ }
+
if (rpc_is_remote())
return rpc_call(RPC_DB_CREATE_KEY, hDB, hKey, key_name, type);
|