Fix improper query rewriting with multibyte characters

Description
===========
Babelfish doesn't properly rewrite queries that contain multibyte characters.

Analysis
=========
Babelfish preprocesses the query string and removes unsupported syntax before sending the query to the PG backend. The implementation didn't consider multibyte Unicode characters, so when Unicode characters appear ahead of the unsupported syntax, Babelfish emits a broken query. Specifically, a character offset is used instead of a byte offset during character replacement. For example:

Input T-SQL : select "你好世界" from tbl with(nolock);
Executed SQL: select "你好世界" f (nolock);

Solution
========
Consolidate all rewriting behaviors into PLtsql_expr_query_mutator. This is more maintainable because there will be only one interface for query rewriting. This patch also adds support for the Chinese Unicode character set in identifiers.
markrui3 · Jan 26, 2023 · 7c4cbad · 7c4cbad
1 parent c32d387
commit 7c4cbad
Show file tree

Hide file tree

Showing 4 changed files with 168 additions and 100 deletions.
diff --git a/contrib/babelfishpg_tsql/antlr/TSqlLexer.g4 b/contrib/babelfishpg_tsql/antlr/TSqlLexer.g4
@@ -1238,94 +1238,95 @@ fragment LETTER
     | '\u03a3'..'\u03ce'
     | '\u03d0'..'\u03d7'
     | '\u03da'..'\u03f3'
-//    | '\u0400'..'\u0481'  // Cyrillic
-//    | '\u048c'..'\u04c4'
-//    | '\u04c7'..'\u04c8'
-//    | '\u04cb'..'\u04cc'
-//    | '\u04d0'..'\u04f5'
-//    | '\u04f8'..'\u04f9'
-//    | '\u05d0'..'\u05ea'  // Hebrew
-//    | '\u0621'..'\u063a'  // Arabic
-//    | '\u0641'..'\u064a'
-//    | '\u0660'..'\u0669'
-//    | '\u0671'..'\u06d3'
-//    | '\u06d5'
-//    | '\u06f0'..'\u06f9'
-//    | '\u06fa'..'\u06fc'
-//    | '\u0e01'..'\u0e5b'  // Thai
-//    | '\u1100'..'\u1159'  // Hangul/Korean
-//    | '\u1161'..'\u11a2'
-//    | '\u11a8'..'\u11f9'
-//    | '\u1e00'..'\u1e9b'  // Latin Extended Additional
-//    | '\u1ea0'..'\u1ef9'
-//    | '\u1f00'..'\u1f15'  // Greek Extended
-//    | '\u1f18'..'\u1f1d'
-//    | '\u1f20'..'\u1f45'
-//    | '\u1f48'..'\u1f4d'
-//    | '\u1f50'..'\u1f57'
-//    | '\u1f59'
-//    | '\u1f5b'
-//    | '\u1f5d'
-//    | '\u1f5f'..'\u1f7d'
-//    | '\u1f80'..'\u1fb4'
-//    | '\u1fb6'..'\u1fbc'
-//    | '\u1fc2'..'\u1fc4'
-//    | '\u1fc6'..'\u1fcc'
-//    | '\u1fd0'..'\u1fd3'
-//    | '\u1fd6'..'\u1fdb'
-//    | '\u1fe0'..'\u1fec'
-//    | '\u1ff2'..'\u1ff4'
-//    | '\u1ff6'..'\u1ffc'
-//    | '\u210a'..'\u2113' // Letter-like symbols
-//    | '\u2118'..'\u211d'
-//    | '\u212a'..'\u212d'
-//    | '\u212f'..'\u2131'
-//    | '\u2133'..'\u2138'
-//    | '\u2160'..'\u2183' // Roman Numeral
-//    | '\u2460'..'\u24ea' // Enclosed Alphanumerics
-//    | '\u2e80'..'\u2ef3' // CJK Radicals Supplement
-//    | '\u2f00'..'\u2fd5' // Kangxi Radicals
-//    | '\u3021'..'\u3029' // CJK
-//    | '\u3031'..'\u3035'
-//    | '\u3038'..'\u303a'
-//    | '\u3041'..'\u3094' // Hiragana
-//    | '\u309d'..'\u309e'
-//    | '\u30a1'..'\u30fa' // Katakana
-//    | '\u30fc'..'\u30fe'
-//    | '\u3105'..'\u312c' // Bopomofo
-//    | '\u3131'..'\u318e' // Hangul Compatability Jamo
-//    | '\u31a0'..'\u31b7' // Bopomofo Extended
-//    | '\ua000'..'\ua48c' // Yi Syllables
-//    | '\uac00'           // Hangul Syllables
-//    | '\ud7a3'
-//    | '\uf900'..'\ufa2d' // CJK Compatibility Ideographs
-//    | '\ufb00'..'\ufb06' // Alphabetic Presentation Forms
-//    | '\ufb13'..'\ufb17'
-//    | '\ufb1d'
-//    | '\ufb1f'..'\ufb28'
-//    | '\ufb2a'..'\ufb36'
-//    | '\ufb38'..'\ufb3c'
-//    | '\ufb3e'
-//    | '\ufb40'..'\ufb41'
-//    | '\ufb43'..'\ufb44'
-//    | '\ufb46'..'\ufb4f'
-//    | '\ufb50'..'\ufbb1' // Arabic Presentation Forms-A
-//    | '\ufbd3'..'\ufd3d'
-//    | '\ufd50'..'\ufd8f'
-//    | '\ufd92'..'\ufdc7'
-//    | '\ufdf0'..'\ufdfb'
-//    | '\ufe70'..'\ufe72' // Arabic Presentation Forms-B
-//    | '\ufe74'
-//    | '\ufe76'..'\ufefc'
-//    | '\uff21'..'\uff3a' // Halfwidth and Fullwidth Forms
-//    | '\uff41'..'\uff5a'
-//    | '\uff66'..'\uffbe'
-//    | '\uffc2'..'\uffc7'
-//    | '\uffca'..'\uffcf'
-//    | '\uffd2'..'\uffd7'
-//    | '\uffda'..'\uffdc'
-//    | '\u10000'..'\u1F9FF'  //not supporting 4-byte chars
-//    | '\u20000'..'\u2FA1F'
+    | '\u0400'..'\u0481'  // Cyrillic
+    | '\u048c'..'\u04c4'
+    | '\u04c7'..'\u04c8'
+    | '\u04cb'..'\u04cc'
+    | '\u04d0'..'\u04f5'
+    | '\u04f8'..'\u04f9'
+    | '\u05d0'..'\u05ea'  // Hebrew
+    | '\u0621'..'\u063a'  // Arabic
+    | '\u0641'..'\u064a'
+    | '\u0660'..'\u0669'
+    | '\u0671'..'\u06d3'
+    | '\u06d5'
+    | '\u06f0'..'\u06f9'
+    | '\u06fa'..'\u06fc'
+    | '\u0e01'..'\u0e5b'  // Thai
+    | '\u1100'..'\u1159'  // Hangul/Korean
+    | '\u1161'..'\u11a2'
+    | '\u11a8'..'\u11f9'
+    | '\u1e00'..'\u1e9b'  // Latin Extended Additional
+    | '\u1ea0'..'\u1ef9'
+    | '\u1f00'..'\u1f15'  // Greek Extended
+    | '\u1f18'..'\u1f1d'
+    | '\u1f20'..'\u1f45'
+    | '\u1f48'..'\u1f4d'
+    | '\u1f50'..'\u1f57'
+    | '\u1f59'
+    | '\u1f5b'
+    | '\u1f5d'
+    | '\u1f5f'..'\u1f7d'
+    | '\u1f80'..'\u1fb4'
+    | '\u1fb6'..'\u1fbc'
+    | '\u1fc2'..'\u1fc4'
+    | '\u1fc6'..'\u1fcc'
+    | '\u1fd0'..'\u1fd3'
+    | '\u1fd6'..'\u1fdb'
+    | '\u1fe0'..'\u1fec'
+    | '\u1ff2'..'\u1ff4'
+    | '\u1ff6'..'\u1ffc'
+    | '\u210a'..'\u2113' // Letter-like symbols
+    | '\u2118'..'\u211d'
+    | '\u212a'..'\u212d'
+    | '\u212f'..'\u2131'
+    | '\u2133'..'\u2138'
+    | '\u2160'..'\u2183' // Roman Numeral
+    | '\u2460'..'\u24ea' // Enclosed Alphanumerics
+    | '\u2e80'..'\u2ef3' // CJK Radicals Supplement
+    | '\u2f00'..'\u2fd5' // Kangxi Radicals
+    | '\u3021'..'\u3029' // CJK
+    | '\u3031'..'\u3035'
+    | '\u3038'..'\u303a'
+    | '\u3041'..'\u3094' // Hiragana
+    | '\u309d'..'\u309e'
+    | '\u30a1'..'\u30fa' // Katakana
+    | '\u30fc'..'\u30fe'
+    | '\u3105'..'\u312c' // Bopomofo
+    | '\u3131'..'\u318e' // Hangul Compatability Jamo
+    | '\u31a0'..'\u31b7' // Bopomofo Extended
+    | '\u4e00'..'\u9fa5' // Chinese
+    | '\ua000'..'\ua48c' // Yi Syllables
+    | '\uac00'           // Hangul Syllables
+    | '\ud7a3'
+    | '\uf900'..'\ufa2d' // CJK Compatibility Ideographs
+    | '\ufb00'..'\ufb06' // Alphabetic Presentation Forms
+    | '\ufb13'..'\ufb17'
+    | '\ufb1d'
+    | '\ufb1f'..'\ufb28'
+    | '\ufb2a'..'\ufb36'
+    | '\ufb38'..'\ufb3c'
+    | '\ufb3e'
+    | '\ufb40'..'\ufb41'
+    | '\ufb43'..'\ufb44'
+    | '\ufb46'..'\ufb4f'
+    | '\ufb50'..'\ufbb1' // Arabic Presentation Forms-A
+    | '\ufbd3'..'\ufd3d'
+    | '\ufd50'..'\ufd8f'
+    | '\ufd92'..'\ufdc7'
+    | '\ufdf0'..'\ufdfb'
+    | '\ufe70'..'\ufe72' // Arabic Presentation Forms-B
+    | '\ufe74'
+    | '\ufe76'..'\ufefc'
+    | '\uff21'..'\uff3a' // Halfwidth and Fullwidth Forms
+    | '\uff41'..'\uff5a'
+    | '\uff66'..'\uffbe'
+    | '\uffc2'..'\uffc7'
+    | '\uffca'..'\uffcf'
+    | '\uffd2'..'\uffd7'
+    | '\uffda'..'\uffdc'
+    | '\u{10000}'..'\u{1F9FF}'
+    | '\u{20000}'..'\u{2FA1F}'
     ;
 
 

diff --git a/contrib/babelfishpg_tsql/src/tsqlIface.cpp b/contrib/babelfishpg_tsql/src/tsqlIface.cpp
@@ -534,8 +534,6 @@ void PLtsql_expr_query_mutator::add(int antlr_pos, std::string orig_text, std::s
 		throw PGErrorWrapperException(ERROR, ERRCODE_INTERNAL_ERROR, "can't mutate an internal query. offset value is negative", 0, 0);
 	if (offset > (int)strlen(expr->query))
 		throw PGErrorWrapperException(ERROR, ERRCODE_INTERNAL_ERROR, "can't mutate an internal query. offset value is too large", 0, 0);
-	if (m.find(offset) != m.end())
-		throw PGErrorWrapperException(ERROR, ERRCODE_INTERNAL_ERROR, "can't mutate an internal query. mulitiple mutation on the same position", 0, 0);
 
 	m.emplace(std::make_pair(offset, std::make_pair(orig_text, repl_text)));
 }
@@ -1544,6 +1542,9 @@ class tsqlBuilder : public tsqlCommonMutator
 				throw PGErrorWrapperException(ERROR, ERRCODE_INVALID_FUNCTION_DEFINITION, "'DELETE' cannot be used within a function", getLineAndPos(ctx->delete_statement()->delete_statement_from()->ddl_object()));
 		}
 
+		/* we must add previous rewrite at first. */
+		add_rewritten_query_fragment_to_mutator(statementMutator.get());
+
 		// post-processing of execsql stmt query
 		for (auto &entry : local_id_positions)
 		{
@@ -1552,9 +1553,6 @@ class tsqlBuilder : public tsqlCommonMutator
 			statementMutator->add(entry.first, entry.second, quoted_local_id);
 		}
 
-		/* common routine for select and non-select */
-		add_rewritten_query_fragment_to_mutator(statementMutator.get());
-
 		/* Add query hints */
 		if (query_hints.size() && enable_hint_mapping)
 		{
@@ -2889,6 +2887,7 @@ rewriteBatchLevelStatement(
 	// Run select statement mutator
 	antlr4::tree::ParseTreeWalker walker;
 	walker.walk(ssm, ctx);
+	add_rewritten_query_fragment_to_mutator(&mutator);
 
 	mutator.run();
 	ssm->mutator = nullptr;
@@ -3456,18 +3455,17 @@ void replaceTokenStringFromQuery(PLtsql_expr* expr, Token* startToken, Token* en
 		throw PGErrorWrapperException(ERROR, ERRCODE_SYNTAX_ERROR, "can't generate an internal query", getLineAndPos(baseCtx));
 
 	size_t baseIdx = baseCtx->getStart()->getStartIndex();
-	if (endIdx == INVALID_INDEX)
+	if (baseIdx == INVALID_INDEX)
 		throw PGErrorWrapperException(ERROR, ERRCODE_SYNTAX_ERROR, "can't generate an internal query", getLineAndPos(baseCtx));
 
 	// repl string is too long. we cannot replace with it in place.
 	if (repl && strlen(repl) > endIdx - startIdx + 1)
 		throw PGErrorWrapperException(ERROR, ERRCODE_SYNTAX_ERROR, "can't generate an internal query", getLineAndPos(baseCtx));
 
 	Assert(expr->query);
-	memset(expr->query + startIdx - baseIdx, ' ', endIdx - startIdx + 1);
 
-	if (repl)
-		memcpy(expr->query + startIdx - baseIdx, repl, strlen(repl));
+	/* store and rewrite instead of in-place rewrite */
+	rewritten_query_fragment.emplace(std::make_pair(startIdx, std::make_pair(startToken->getInputStream()->getText(misc::Interval(startIdx, endIdx)), repl ? std::string(repl) : std::string(endIdx - startIdx + 1, ' '))));
 }
 
 void replaceTokenStringFromQuery(PLtsql_expr* expr, TerminalNode* tokenNode, const char * repl, ParserRuleContext *baseCtx)
@@ -3911,12 +3909,16 @@ makeReturnQueryStmt(TSqlParser::Select_statement_standaloneContext *ctx, bool it
 		if (base_index == INVALID_INDEX)
 			throw PGErrorWrapperException(ERROR, ERRCODE_SYNTAX_ERROR, "can't generate an internal query", getLineAndPos(ctx));
 
-		auto *query = itvf_expr->query;
+		/* we must add previous rewrite at first. */
+		add_rewritten_query_fragment_to_mutator(&itvf_mutator);
+
+		std::u32string query = utf8_to_utf32(itvf_expr->query);
 		for (const auto &entry : local_id_positions)
 		{
 			const std::string& local_id = entry.second;
+			const std::u32string& local_id_u32 = utf8_to_utf32(local_id.c_str());
 			size_t offset = entry.first - base_index;
-			if (strncmp(local_id.c_str(), query+offset, local_id.length()) == 0) // local_id maybe already deleted in some cases such as select-assignment. check here if it still exists)
+			if (query.substr(offset, local_id_u32.length()) == local_id_u32) // local_id maybe already deleted in some cases such as select-assignment. check here if it still exists)
 			{
 				int dno;
 				PLtsql_nsitem *nse = pltsql_ns_lookup(pltsql_ns_top(), false, local_id.c_str(), nullptr, nullptr, nullptr);
@@ -3930,7 +3932,6 @@ makeReturnQueryStmt(TSqlParser::Select_statement_standaloneContext *ctx, bool it
 				itvf_mutator.add(entry.first, entry.second, repl_text);
 			}
 		}
-		add_rewritten_query_fragment_to_mutator(&itvf_mutator);
 		itvf_mutator.run();
 		result->query->itvf_query = itvf_expr->query;
 	}

diff --git a/test/JDBC/expected/babel_unicode_charset.out b/test/JDBC/expected/babel_unicode_charset.out
@@ -0,0 +1,46 @@
+drop table if exists unicode_test;
+go
+create table unicode_test(col nvarchar(255), 中文列名 nvarchar(255));
+go
+insert into unicode_test values('Hello', '你好');
+go
+~~ROW COUNT: 1~~
+
+insert into unicode_test values('World', '世界');
+go
+~~ROW COUNT: 1~~
+
+
+/* multibyte characters as identifier */
+select col 别名 from unicode_test;
+go
+~~START~~
+nvarchar
+Hello
+World
+~~END~~
+
+select 别名=col from unicode_test;
+go
+~~START~~
+nvarchar
+Hello
+World
+~~END~~
+
+
+/* multibyte characters with unsupported token */
+select "你好世界" from unicode_test with(nolock);
+go
+~~ERROR (Code: 33557097)~~
+
+~~ERROR (Message: column "你好世界" does not exist)~~
+
+select 中文列名 from unicode_test with(nolock);
+go
+~~START~~
+nvarchar
+你好
+世界
+~~END~~
+
diff --git a/test/JDBC/input/babel_unicode_charset.sql b/test/JDBC/input/babel_unicode_charset.sql
@@ -0,0 +1,20 @@
+drop table if exists unicode_test;
+go
+create table unicode_test(col nvarchar(255), 中文列名 nvarchar(255));
+go
+insert into unicode_test values('Hello', '你好');
+go
+insert into unicode_test values('World', '世界');
+go
+
+/* multibyte characters as identifier */
+select col 别名 from unicode_test;
+go
+select 别名=col from unicode_test;
+go
+
+/* multibyte characters with unsupported token */
+select "你好世界" from unicode_test with(nolock);
+go
+select 中文列名 from unicode_test with(nolock);
+go