Skip to content

Commit

Permalink
Fix improper query rewriting with multibyte characters
Browse files Browse the repository at this point in the history
Description
===========
Babelfish doesn't properly rewrite queries that contain multibyte characters.

Analysis
=========
Babelfish preprocesses the query string and removes unsupported syntax before
sending the query to the PG backend. The implementation didn't account for
multibyte Unicode characters, so when such characters appear ahead of the
unsupported syntax, Babelfish emits a broken query. Specifically, a character
offset is used instead of a byte offset during the replacement.
For example:
Input T-SQL : select "你好世界" from tbl with(nolock);
Executed SQL: select "你好世界" f            (nolock);

Solution
========
Consolidate all rewriting behavior into PLtsql_expr_query_mutator. This is more
maintainable because there is then only one interface for query rewriting.
This patch also adds support for Chinese Unicode characters in identifiers.
  • Loading branch information
markrui3 committed Jan 26, 2023
1 parent c32d387 commit 7c4cbad
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 100 deletions.
177 changes: 89 additions & 88 deletions contrib/babelfishpg_tsql/antlr/TSqlLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -1238,94 +1238,95 @@ fragment LETTER
| '\u03a3'..'\u03ce'
| '\u03d0'..'\u03d7'
| '\u03da'..'\u03f3'
// | '\u0400'..'\u0481' // Cyrillic
// | '\u048c'..'\u04c4'
// | '\u04c7'..'\u04c8'
// | '\u04cb'..'\u04cc'
// | '\u04d0'..'\u04f5'
// | '\u04f8'..'\u04f9'
// | '\u05d0'..'\u05ea' // Hebrew
// | '\u0621'..'\u063a' // Arabic
// | '\u0641'..'\u064a'
// | '\u0660'..'\u0669'
// | '\u0671'..'\u06d3'
// | '\u06d5'
// | '\u06f0'..'\u06f9'
// | '\u06fa'..'\u06fc'
// | '\u0e01'..'\u0e5b' // Thai
// | '\u1100'..'\u1159' // Hangul/Korean
// | '\u1161'..'\u11a2'
// | '\u11a8'..'\u11f9'
// | '\u1e00'..'\u1e9b' // Latin Extended Additional
// | '\u1ea0'..'\u1ef9'
// | '\u1f00'..'\u1f15' // Greek Extended
// | '\u1f18'..'\u1f1d'
// | '\u1f20'..'\u1f45'
// | '\u1f48'..'\u1f4d'
// | '\u1f50'..'\u1f57'
// | '\u1f59'
// | '\u1f5b'
// | '\u1f5d'
// | '\u1f5f'..'\u1f7d'
// | '\u1f80'..'\u1fb4'
// | '\u1fb6'..'\u1fbc'
// | '\u1fc2'..'\u1fc4'
// | '\u1fc6'..'\u1fcc'
// | '\u1fd0'..'\u1fd3'
// | '\u1fd6'..'\u1fdb'
// | '\u1fe0'..'\u1fec'
// | '\u1ff2'..'\u1ff4'
// | '\u1ff6'..'\u1ffc'
// | '\u210a'..'\u2113' // Letter-like symbols
// | '\u2118'..'\u211d'
// | '\u212a'..'\u212d'
// | '\u212f'..'\u2131'
// | '\u2133'..'\u2138'
// | '\u2160'..'\u2183' // Roman Numeral
// | '\u2460'..'\u24ea' // Enclosed Alphanumerics
// | '\u2e80'..'\u2ef3' // CJK Radicals Supplement
// | '\u2f00'..'\u2fd5' // Kangxi Radicals
// | '\u3021'..'\u3029' // CJK
// | '\u3031'..'\u3035'
// | '\u3038'..'\u303a'
// | '\u3041'..'\u3094' // Hiragana
// | '\u309d'..'\u309e'
// | '\u30a1'..'\u30fa' // Katakana
// | '\u30fc'..'\u30fe'
// | '\u3105'..'\u312c' // Bopomofo
// | '\u3131'..'\u318e' // Hangul Compatability Jamo
// | '\u31a0'..'\u31b7' // Bopomofo Extended
// | '\ua000'..'\ua48c' // Yi Syllables
// | '\uac00' // Hangul Syllables
// | '\ud7a3'
// | '\uf900'..'\ufa2d' // CJK Compatibility Ideographs
// | '\ufb00'..'\ufb06' // Alphabetic Presentation Forms
// | '\ufb13'..'\ufb17'
// | '\ufb1d'
// | '\ufb1f'..'\ufb28'
// | '\ufb2a'..'\ufb36'
// | '\ufb38'..'\ufb3c'
// | '\ufb3e'
// | '\ufb40'..'\ufb41'
// | '\ufb43'..'\ufb44'
// | '\ufb46'..'\ufb4f'
// | '\ufb50'..'\ufbb1' // Arabic Presentation Forms-A
// | '\ufbd3'..'\ufd3d'
// | '\ufd50'..'\ufd8f'
// | '\ufd92'..'\ufdc7'
// | '\ufdf0'..'\ufdfb'
// | '\ufe70'..'\ufe72' // Arabic Presentation Forms-B
// | '\ufe74'
// | '\ufe76'..'\ufefc'
// | '\uff21'..'\uff3a' // Halfwidth and Fullwidth Forms
// | '\uff41'..'\uff5a'
// | '\uff66'..'\uffbe'
// | '\uffc2'..'\uffc7'
// | '\uffca'..'\uffcf'
// | '\uffd2'..'\uffd7'
// | '\uffda'..'\uffdc'
// | '\u10000'..'\u1F9FF' //not supporting 4-byte chars
// | '\u20000'..'\u2FA1F'
| '\u0400'..'\u0481' // Cyrillic
| '\u048c'..'\u04c4'
| '\u04c7'..'\u04c8'
| '\u04cb'..'\u04cc'
| '\u04d0'..'\u04f5'
| '\u04f8'..'\u04f9'
| '\u05d0'..'\u05ea' // Hebrew
| '\u0621'..'\u063a' // Arabic
| '\u0641'..'\u064a'
| '\u0660'..'\u0669'
| '\u0671'..'\u06d3'
| '\u06d5'
| '\u06f0'..'\u06f9'
| '\u06fa'..'\u06fc'
| '\u0e01'..'\u0e5b' // Thai
| '\u1100'..'\u1159' // Hangul/Korean
| '\u1161'..'\u11a2'
| '\u11a8'..'\u11f9'
| '\u1e00'..'\u1e9b' // Latin Extended Additional
| '\u1ea0'..'\u1ef9'
| '\u1f00'..'\u1f15' // Greek Extended
| '\u1f18'..'\u1f1d'
| '\u1f20'..'\u1f45'
| '\u1f48'..'\u1f4d'
| '\u1f50'..'\u1f57'
| '\u1f59'
| '\u1f5b'
| '\u1f5d'
| '\u1f5f'..'\u1f7d'
| '\u1f80'..'\u1fb4'
| '\u1fb6'..'\u1fbc'
| '\u1fc2'..'\u1fc4'
| '\u1fc6'..'\u1fcc'
| '\u1fd0'..'\u1fd3'
| '\u1fd6'..'\u1fdb'
| '\u1fe0'..'\u1fec'
| '\u1ff2'..'\u1ff4'
| '\u1ff6'..'\u1ffc'
| '\u210a'..'\u2113' // Letter-like symbols
| '\u2118'..'\u211d'
| '\u212a'..'\u212d'
| '\u212f'..'\u2131'
| '\u2133'..'\u2138'
| '\u2160'..'\u2183' // Roman Numeral
| '\u2460'..'\u24ea' // Enclosed Alphanumerics
| '\u2e80'..'\u2ef3' // CJK Radicals Supplement
| '\u2f00'..'\u2fd5' // Kangxi Radicals
| '\u3021'..'\u3029' // CJK
| '\u3031'..'\u3035'
| '\u3038'..'\u303a'
| '\u3041'..'\u3094' // Hiragana
| '\u309d'..'\u309e'
| '\u30a1'..'\u30fa' // Katakana
| '\u30fc'..'\u30fe'
| '\u3105'..'\u312c' // Bopomofo
| '\u3131'..'\u318e' // Hangul Compatability Jamo
| '\u31a0'..'\u31b7' // Bopomofo Extended
| '\u4e00'..'\u9fa5' // Chinese
| '\ua000'..'\ua48c' // Yi Syllables
| '\uac00' // Hangul Syllables
| '\ud7a3'
| '\uf900'..'\ufa2d' // CJK Compatibility Ideographs
| '\ufb00'..'\ufb06' // Alphabetic Presentation Forms
| '\ufb13'..'\ufb17'
| '\ufb1d'
| '\ufb1f'..'\ufb28'
| '\ufb2a'..'\ufb36'
| '\ufb38'..'\ufb3c'
| '\ufb3e'
| '\ufb40'..'\ufb41'
| '\ufb43'..'\ufb44'
| '\ufb46'..'\ufb4f'
| '\ufb50'..'\ufbb1' // Arabic Presentation Forms-A
| '\ufbd3'..'\ufd3d'
| '\ufd50'..'\ufd8f'
| '\ufd92'..'\ufdc7'
| '\ufdf0'..'\ufdfb'
| '\ufe70'..'\ufe72' // Arabic Presentation Forms-B
| '\ufe74'
| '\ufe76'..'\ufefc'
| '\uff21'..'\uff3a' // Halfwidth and Fullwidth Forms
| '\uff41'..'\uff5a'
| '\uff66'..'\uffbe'
| '\uffc2'..'\uffc7'
| '\uffca'..'\uffcf'
| '\uffd2'..'\uffd7'
| '\uffda'..'\uffdc'
| '\u{10000}'..'\u{1F9FF}'
| '\u{20000}'..'\u{2FA1F}'
;


Expand Down
25 changes: 13 additions & 12 deletions contrib/babelfishpg_tsql/src/tsqlIface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -534,8 +534,6 @@ void PLtsql_expr_query_mutator::add(int antlr_pos, std::string orig_text, std::s
throw PGErrorWrapperException(ERROR, ERRCODE_INTERNAL_ERROR, "can't mutate an internal query. offset value is negative", 0, 0);
if (offset > (int)strlen(expr->query))
throw PGErrorWrapperException(ERROR, ERRCODE_INTERNAL_ERROR, "can't mutate an internal query. offset value is too large", 0, 0);
if (m.find(offset) != m.end())
throw PGErrorWrapperException(ERROR, ERRCODE_INTERNAL_ERROR, "can't mutate an internal query. mulitiple mutation on the same position", 0, 0);

m.emplace(std::make_pair(offset, std::make_pair(orig_text, repl_text)));
}
Expand Down Expand Up @@ -1544,6 +1542,9 @@ class tsqlBuilder : public tsqlCommonMutator
throw PGErrorWrapperException(ERROR, ERRCODE_INVALID_FUNCTION_DEFINITION, "'DELETE' cannot be used within a function", getLineAndPos(ctx->delete_statement()->delete_statement_from()->ddl_object()));
}

/* we must add previous rewrite at first. */
add_rewritten_query_fragment_to_mutator(statementMutator.get());

// post-processing of execsql stmt query
for (auto &entry : local_id_positions)
{
Expand All @@ -1552,9 +1553,6 @@ class tsqlBuilder : public tsqlCommonMutator
statementMutator->add(entry.first, entry.second, quoted_local_id);
}

/* common routine for select and non-select */
add_rewritten_query_fragment_to_mutator(statementMutator.get());

/* Add query hints */
if (query_hints.size() && enable_hint_mapping)
{
Expand Down Expand Up @@ -2889,6 +2887,7 @@ rewriteBatchLevelStatement(
// Run select statement mutator
antlr4::tree::ParseTreeWalker walker;
walker.walk(ssm, ctx);
add_rewritten_query_fragment_to_mutator(&mutator);

mutator.run();
ssm->mutator = nullptr;
Expand Down Expand Up @@ -3456,18 +3455,17 @@ void replaceTokenStringFromQuery(PLtsql_expr* expr, Token* startToken, Token* en
throw PGErrorWrapperException(ERROR, ERRCODE_SYNTAX_ERROR, "can't generate an internal query", getLineAndPos(baseCtx));

size_t baseIdx = baseCtx->getStart()->getStartIndex();
if (endIdx == INVALID_INDEX)
if (baseIdx == INVALID_INDEX)
throw PGErrorWrapperException(ERROR, ERRCODE_SYNTAX_ERROR, "can't generate an internal query", getLineAndPos(baseCtx));

// repl string is too long. we cannot replace with it in place.
if (repl && strlen(repl) > endIdx - startIdx + 1)
throw PGErrorWrapperException(ERROR, ERRCODE_SYNTAX_ERROR, "can't generate an internal query", getLineAndPos(baseCtx));

Assert(expr->query);
memset(expr->query + startIdx - baseIdx, ' ', endIdx - startIdx + 1);

if (repl)
memcpy(expr->query + startIdx - baseIdx, repl, strlen(repl));
/* store and rewrite instead of in-place rewrite */
rewritten_query_fragment.emplace(std::make_pair(startIdx, std::make_pair(startToken->getInputStream()->getText(misc::Interval(startIdx, endIdx)), repl ? std::string(repl) : std::string(endIdx - startIdx + 1, ' '))));
}

void replaceTokenStringFromQuery(PLtsql_expr* expr, TerminalNode* tokenNode, const char * repl, ParserRuleContext *baseCtx)
Expand Down Expand Up @@ -3911,12 +3909,16 @@ makeReturnQueryStmt(TSqlParser::Select_statement_standaloneContext *ctx, bool it
if (base_index == INVALID_INDEX)
throw PGErrorWrapperException(ERROR, ERRCODE_SYNTAX_ERROR, "can't generate an internal query", getLineAndPos(ctx));

auto *query = itvf_expr->query;
/* we must add previous rewrite at first. */
add_rewritten_query_fragment_to_mutator(&itvf_mutator);

std::u32string query = utf8_to_utf32(itvf_expr->query);
for (const auto &entry : local_id_positions)
{
const std::string& local_id = entry.second;
const std::u32string& local_id_u32 = utf8_to_utf32(local_id.c_str());
size_t offset = entry.first - base_index;
if (strncmp(local_id.c_str(), query+offset, local_id.length()) == 0) // local_id maybe already deleted in some cases such as select-assignment. check here if it still exists)
if (query.substr(offset, local_id_u32.length()) == local_id_u32) // local_id maybe already deleted in some cases such as select-assignment. check here if it still exists)
{
int dno;
PLtsql_nsitem *nse = pltsql_ns_lookup(pltsql_ns_top(), false, local_id.c_str(), nullptr, nullptr, nullptr);
Expand All @@ -3930,7 +3932,6 @@ makeReturnQueryStmt(TSqlParser::Select_statement_standaloneContext *ctx, bool it
itvf_mutator.add(entry.first, entry.second, repl_text);
}
}
add_rewritten_query_fragment_to_mutator(&itvf_mutator);
itvf_mutator.run();
result->query->itvf_query = itvf_expr->query;
}
Expand Down
46 changes: 46 additions & 0 deletions test/JDBC/expected/babel_unicode_charset.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
drop table if exists unicode_test;
go
create table unicode_test(col nvarchar(255), 中文列名 nvarchar(255));
go
insert into unicode_test values('Hello', '你好');
go
~~ROW COUNT: 1~~

insert into unicode_test values('World', '世界');
go
~~ROW COUNT: 1~~


/* multibyte characters as identifier */
select col 别名 from unicode_test;
go
~~START~~
nvarchar
Hello
World
~~END~~

select 别名=col from unicode_test;
go
~~START~~
nvarchar
Hello
World
~~END~~


/* multibyte characters with unsupported token */
select "你好世界" from unicode_test with(nolock);
go
~~ERROR (Code: 33557097)~~

~~ERROR (Message: column "你好世界" does not exist)~~

select 中文列名 from unicode_test with(nolock);
go
~~START~~
nvarchar
你好
世界
~~END~~

20 changes: 20 additions & 0 deletions test/JDBC/input/babel_unicode_charset.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
drop table if exists unicode_test;
go
create table unicode_test(col nvarchar(255), 中文列名 nvarchar(255));
go
insert into unicode_test values('Hello', '你好');
go
insert into unicode_test values('World', '世界');
go

/* multibyte characters as identifier */
select col 别名 from unicode_test;
go
select 别名=col from unicode_test;
go

/* multibyte characters with unsupported token */
select "你好世界" from unicode_test with(nolock);
go
select 中文列名 from unicode_test with(nolock);
go

0 comments on commit 7c4cbad

Please sign in to comment.