Skip to content

Commit c8fa172

Browse files
author
H. Peter Anvin (Intel)
committed
Add support for C2y-style \o and braced escapes, and \d for decimal
C2y adds \o for octal byte escapes, and allows \x, \o, or \u escape sequences to be enclosed in braces to indicate their termination. Add a \d sequence as a NASM extension to allow decimal bytes to be specified. Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
1 parent b1909cf commit c8fa172

File tree

5 files changed

+139
-62
lines changed

5 files changed

+139
-62
lines changed

asm/quote.c

Lines changed: 79 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -269,54 +269,53 @@ char *nasm_quote_cstr(const char *str, size_t *lenp)
269269
* to indicate the lead marker of a quoted string. If it is '\"', then
270270
* '`' is not a special character at all.
271271
*/
272+
enum unq_state {
273+
st_start, /* Ordinary text; watch for backslash or end of string */
274+
st_backslash, /* Looking at the character right after a backslash */
275+
st_byte, /* Byte numeric sequence */
276+
st_ucs, /* \u or \U */
277+
st_done /* Terminator seen; the state machine loop exits */
278+
};
272279

273280
size_t nasm_unquote_anystr(char *str, char **ep, const uint32_t badctl,
274281
const char qstart)
275282
{
276-
unsigned char bq;
277-
const unsigned char *p;
278-
const unsigned char *escp = NULL;
279-
unsigned char *q;
283+
const unsigned char bq = *str;
284+
const unsigned char *p = (unsigned char *)str;
285+
unsigned char *q = (unsigned char *)str;
280286
unsigned char c;
281287
uint32_t ctlmask = 0; /* Mask of control characters seen */
282-
enum unq_state {
283-
st_start,
284-
st_backslash,
285-
st_hex,
286-
st_oct,
287-
st_ucs,
288-
st_done
289-
} state;
290-
int ndig = 0;
291-
uint32_t nval = 0;
292-
293-
p = q = (unsigned char *)str;
294-
295-
bq = *p++;
296-
if (!bq)
297-
return 0;
298288

299289
if (bq == (unsigned char)qstart) {
300-
/* `...` string */
301-
state = st_start;
290+
/* `...` string or "..." with C unquoting */
291+
enum unq_state state = st_start;
292+
unsigned int base = 0; /* Base of numeric escape sequence */
293+
uint64_t nval = 0; /* Accumulated value of numeric sequence */
294+
unsigned int v;
295+
int ndig = 0; /* Max digits of numeric sequence */
296+
/* ndig < 0 means braced sequence */
297+
const unsigned char *escp = NULL; /* Pointer to immediately after \ */
298+
299+
p++; /* Skip initial quote */
302300

303301
while (state != st_done) {
304302
c = *p++;
305303
switch (state) {
306304
case st_start:
307305
if (c == '\\') {
308306
state = st_backslash;
309-
} else if ((c == '\0') | (c == bq)) {
307+
} else if (c == '\0' || c == bq) {
310308
state = st_done;
311309
} else {
312310
EMIT(c);
313311
}
314-
break;
312+
break;
315313

316314
case st_backslash:
317315
state = st_start;
318-
escp = p; /* Beginning of argument sequence */
316+
escp = p-1;
319317
nval = 0;
318+
320319
switch (c) {
321320
case 'a':
322321
nval = 7;
@@ -341,20 +340,28 @@ size_t nasm_unquote_anystr(char *str, char **ep, const uint32_t badctl,
341340
break;
342341
case 'u':
343342
state = st_ucs;
343+
base = 16;
344344
ndig = 4;
345-
break;
345+
goto check_brace;
346346
case 'U':
347347
state = st_ucs;
348+
base = 16;
348349
ndig = 8;
349-
break;
350+
goto check_brace;
350351
case 'v':
351352
nval = 11;
352353
break;
353354
case 'x':
354355
case 'X':
355-
state = st_hex;
356+
state = st_byte;
357+
base = 16;
356358
ndig = 2;
357-
break;
359+
goto check_brace;
360+
case 'd': /* NASM extension: \d = decimal */
361+
state = st_byte;
362+
base = 10;
363+
ndig = 3;
364+
goto check_brace;
358365
case '0':
359366
case '1':
360367
case '2':
@@ -363,10 +370,19 @@ size_t nasm_unquote_anystr(char *str, char **ep, const uint32_t badctl,
363370
case '5':
364371
case '6':
365372
case '7':
366-
state = st_oct;
367-
ndig = 2; /* Up to two more digits */
368-
nval = c - '0';
369-
break;
373+
/* Back up both p and escp, as if there had been an "o" */
374+
p = escp--;
375+
/* fall through */
376+
case 'o':
377+
state = st_byte;
378+
ndig = 3;
379+
base = 8;
380+
check_brace: /* Is this the start of a braced sequence? */
381+
if (*p == '{') {
382+
p++; /* Skip brace */
383+
ndig = -1;
384+
}
385+
break;
370386
case '\0':
371387
nval = '\\';
372388
p--; /* Reprocess; terminates string */
@@ -379,38 +395,37 @@ size_t nasm_unquote_anystr(char *str, char **ep, const uint32_t badctl,
379395
EMIT(nval);
380396
break;
381397

382-
case st_oct:
383-
if (c >= '0' && c <= '7') {
384-
nval = (nval << 3) + (c - '0');
385-
if (--ndig)
386-
break; /* Might have more digits */
387-
} else {
388-
p--; /* Process this character again */
389-
}
390-
EMIT(nval);
391-
state = st_start;
392-
break;
393-
394-
case st_hex:
398+
case st_byte:
395399
case st_ucs:
396-
if (nasm_isxdigit(c)) {
397-
nval = (nval << 4) + numvalue(c);
400+
if ((v = numvalue_chk(c)) < base) {
401+
nval = (nval * base) + v;
398402
if (--ndig)
399-
break; /* Might have more digits */
403+
break; /* Continue processing number, no output */
404+
} else if (ndig < 0) {
405+
/* End of braced sequence */
406+
if (unlikely(c != '}'))
407+
goto rewind;
400408
} else {
401-
p--; /* Process this character again */
409+
p--; /* Reprocess terminating character */
410+
if (unlikely(p == escp+1))
411+
goto rewind; /* No digits at all received */
402412
}
403413

404-
if (unlikely(p <= escp))
405-
EMIT(escp[-1]);
406-
else if (state == st_ucs)
414+
/* Emit the output */
415+
if (state == st_ucs)
407416
EMIT_UTF8(nval);
408417
else
409418
EMIT(nval);
410419

411420
state = st_start;
412421
break;
413422

423+
/* Rewind an entire sequence as invalid */
424+
rewind:
425+
p = escp; /* Start over at character following \ */
426+
state = st_start;
427+
break;
428+
414429
default:
415430
panic();
416431
}
@@ -421,10 +436,11 @@ size_t nasm_unquote_anystr(char *str, char **ep, const uint32_t badctl,
421436
* * any kind, including collapsing double quote marks.)
422437
* We obviously can't get here if qstart == '\"'.
423438
*/
424-
while ((c = *p++) && (c != bq))
439+
p++; /* Skip initial quote */
440+
while ((c = *p++) && c != bq)
425441
EMIT(c);
426442
} else {
427-
/* Not a quoted string, just return the input... */
443+
/* Not a quoted string, just return the input */
428444
while ((c = *p++))
429445
EMIT(c);
430446
}
@@ -435,8 +451,11 @@ size_t nasm_unquote_anystr(char *str, char **ep, const uint32_t badctl,
435451
if (ctlmask & badctl)
436452
nasm_nonfatal("control character in string not allowed here");
437453

438-
if (ep)
454+
if (ep) {
455+
/* Point at the terminating character */
439456
*ep = (char *)p - 1;
457+
}
458+
440459
return (char *)q - str;
441460
}
442461
#undef EMIT
@@ -471,11 +490,7 @@ char *nasm_skip_string(const char *str)
471490
char bq;
472491
const char *p;
473492
char c;
474-
enum unq_state {
475-
st_start,
476-
st_backslash,
477-
st_done
478-
} state;
493+
enum unq_state state;
479494

480495
bq = str[0];
481496
p = str+1;
@@ -515,6 +530,9 @@ char *nasm_skip_string(const char *str)
515530
* a backquote will force a return to the st_start state,
516531
* and any possible multi-character state will terminate
517532
* for any non-alphanumeric character.
533+
*
534+
* The only reason this is needed at all is to detect
535+
* the \` sequence.
518536
*/
519537
state = c ? st_start : st_done;
520538
break;

doc/changes.src

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@ It is the production version of NASM since 2025.
9292
size of the immediate explicitly (e.g. \c{JMP DWORD label}) has
9393
always worked correctly, however.
9494

95+
\b Add support for C2y-style \c{\\o} escape sequences, braced escape
96+
sequences, and as a NASM extension, decimal escape sequences
97+
(\c{\\d}). See \k{strings}.
98+
99+
95100
\S{cl-3.01} Version 3.01
96101

97102
\b A new \c{obj2} version of the \c{obj} output format, intended for

doc/lang.src

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,6 @@ quotes allows double quotes to appear within it and vice versa); the
442442
contents of those are represented verbatim. Strings enclosed in
443443
backquotes support C-style \c{\\}-escapes for special characters.
444444

445-
446445
The following \i{escape sequences} are recognized by backquoted strings:
447446

448447
\c \' single quote (')
@@ -463,6 +462,22 @@ The following \i{escape sequences} are recognized by backquoted strings:
463462
\c \u1234 4 hexadecimal digits - Unicode character
464463
\c \U12345678 8 hexadecimal digits - Unicode character
465464

465+
NASM 3.02 added the following additional sequences:
466+
467+
\c \o377 Up to 3 octal digits - literal byte (from C2y)
468+
\c \d255 Up to 3 decimal digits - literal byte (NASM extension)
469+
470+
Since NASM 3.02, the numeric escape sequences starting with \c{\\x},
471+
\c{\\o}, \c{\\d} or \c{\\u} can have their argument enclosed in curly
472+
braces to delimit their length, instead of terminating after a certain
473+
number of digits or a non-digit another character:
474+
475+
\c \x{FF} Hexadecimal, literal byte
476+
\c \o{377} Octal, literal byte
477+
\c \d{255} Decimal, literal byte
478+
\c \u{1234} Unicode character
479+
\c \u{12345} Unicode character
480+
466481
All other escape sequences are reserved. Note that \c{\\0}, meaning a
467482
\c{NUL} character (ASCII 0), is a special case of the octal escape
468483
sequence.
@@ -471,6 +486,8 @@ sequence.
471486
\i{UTF-8}. For example, the following lines are all equivalent:
472487

473488
\c db `\u263a` ; UTF-8 smiley face
489+
\c db `\U0000263a` ; UTF-8 smiley face
490+
\c db `\u{263a}` ; UTF-8 smiley face
474491
\c db `\xe2\x98\xba` ; UTF-8 smiley face
475492
\c db 0E2h, 098h, 0BAh ; UTF-8 smiley face
476493

include/nasmlib.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,22 @@ static inline unsigned int numvalue(unsigned char c)
273273
return c >= 'a' ? c - 'a' + 10 : c - '0';
274274
}
275275

276+
/* Like numvalue(), but checked: '0'-'9' yield 0..9, letters of either
* case yield 10..35, and any other character yields -1U, so the
277+
* result can be directly compared against the base (up to 36) to
* test for validity. */
278+
static inline unsigned int numvalue_chk(unsigned char c)
279+
{
280+
unsigned int v;
281+
v = c - '0'; /* Wraps to a huge value when c < '0' */
282+
if (v < 10)
283+
return v;
284+
285+
v = (c | 0x20) - 'a'; /* Fold ASCII upper case onto lower case */
286+
if (v < 26)
287+
return v + 10;
288+
289+
return -1U;
290+
}
291+
276292
/*
277293
* Convert a string into a number, using NASM number rules. Sets
278294
* `*error' to true if an error occurs, and false otherwise.

test/bracedquote.asm

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
db "Have a day!!", 13, 10
2+
db 13, 10
3+
;; All these should produce U+1F610 in UTF-8
4+
db `\U0001f610`
5+
db `\u{1f610}`
6+
db `\u{0001f610}`
7+
db `\x{f0}\x{9f}\x{98}\x{90}`
8+
db `\xf0\x9f\x98\x90`
9+
db `\d240\d159\d152\d144`
10+
db `\d{240}\d{0159}\d{152}\d{144}`
11+
db `\360\237\230\220`
12+
db `\o{360}\o{0237}\o{230}\o{00220}`
13+
db `\o360\o237\o230\o220`
14+
15+
db `\U0001F610`
16+
db `\u{1F610}`
17+
db `\u{0001F610}`
18+
db `\x{F0}\x{9F}\x{98}\x{90}`
19+
db `\xF0\x9F\x98\x90`
20+
21+
db `\r\n`

0 commit comments

Comments
 (0)