-
-
Notifications
You must be signed in to change notification settings - Fork 34.1k
Open
Labels
type-bug: An unexpected behavior, bug, or error
Description
Bug report
Bug description:
In the following initialization:
./configure --with-pydebug --with-undefined-behavior-sanitizer
tok->tok_mode_stack[0] = (tokenizer_mode){
.kind = TOK_REGULAR_MODE,
.quote = '\0',
.quote_size = 0,
.in_debug = 0
};
the fields `start` and `multi_line_start` are not explicitly initialized.
Although PyMem_Calloc zero-initializes the allocated memory (so both pointers are set to NULL), this still leads to a serious issue later.
When _PyLexer_remember_fstring_buffers executes:
mode->start_offset = mode->start - tok->buf;
we end up performing pointer arithmetic like `NULL - valid_pointer`. This is undefined behavior (UB) in C.
Self-contained reproducer (test_lexer_overflow.py)
Click to expand full reproducer code
import subprocess
import sys
import os
import tempfile
def test_interactive_long_fstring():
    """Pipe a very long f-string into ``python -i`` through stdin.

    The piped program assigns ``x``, builds an f-string whose literal tail is
    200000 ``A`` characters, and prints ``ok``.  According to the bug report,
    the interactive tokenizer path (tok_underflow_interactive ->
    _PyLexer_tok_reserve_buf) reallocates the buffer here, which is where the
    pointer overflow is reported.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple from the child interpreter.

    Raises:
        subprocess.TimeoutExpired: if the child runs longer than 30 seconds.
    """
    filler = 'A' * 200000
    # Same text as the original f-string template: ``{{x}}`` becomes ``{x}``.
    script = 'x = 42\nresult = f"{x}' + filler + '"\nprint("ok")\n'
    proc = subprocess.run(
        [sys.executable, '-i'],
        input=script,
        capture_output=True,
        text=True,
        timeout=30,
    )
    return proc.stdout, proc.stderr, proc.returncode
def test_interactive_long_line():
    """Pipe a very long plain string assignment into ``python -i``.

    The piped program assigns a 200000-character ``B`` string literal to
    ``x`` and prints ``ok``.  The bug report states that this also triggers
    the problem, because tok_mode_stack[0].start is NULL whether or not an
    f-string is involved.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple from the child interpreter.

    Raises:
        subprocess.TimeoutExpired: if the child runs longer than 30 seconds.
    """
    filler = 'B' * 200000
    script = 'x = "' + filler + '"\nprint("ok")\n'
    proc = subprocess.run(
        [sys.executable, '-i'],
        input=script,
        capture_output=True,
        text=True,
        timeout=30,
    )
    return proc.stdout, proc.stderr, proc.returncode
def test_interactive_nested_fstring():
    """Pipe a long f-string containing a nested f-string into ``python -i``.

    The replacement field of the outer f-string is itself an f-string
    (``{f"{x}"}``), followed by 200000 ``C`` characters.  Per the bug report,
    nesting raises tok_mode_stack_index and so exercises more entries of the
    tokenizer mode stack.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple from the child interpreter.

    Raises:
        subprocess.TimeoutExpired: if the child runs longer than 30 seconds.
    """
    filler = 'C' * 200000
    nested_field = '{f"{x}"}'
    script = (
        'x = 1\nresult = f"'
        + nested_field
        + filler
        + '"\nprint("ok")\n'
    )
    proc = subprocess.run(
        [sys.executable, '-i'],
        input=script,
        capture_output=True,
        text=True,
        timeout=30,
    )
    return proc.stdout, proc.stderr, proc.returncode
def test_file_long_fstring():
    """Run a temporary ``.py`` file that contains a very long f-string.

    The script assigns ``x``, builds an f-string whose literal tail is
    200000 ``D`` characters, and prints ``ok``.  The bug report describes
    this as the file-based tokenizer path (tok_underflow_file ->
    tok_readline_raw -> _PyLexer_tok_reserve_buf).

    Returns:
        A ``(stdout, stderr, returncode)`` tuple from the child interpreter.

    Raises:
        subprocess.TimeoutExpired: if the child runs longer than 30 seconds.
    """
    filler = 'D' * 200000
    # Same text as the original f-string template: ``{{x}}`` becomes ``{x}``.
    script = 'x = 42\nresult = f"{x}' + filler + '"\nprint("ok")\n'
    handle, script_path = tempfile.mkstemp(suffix='.py')
    try:
        with os.fdopen(handle, 'w') as out:
            out.write(script)
        proc = subprocess.run(
            [sys.executable, script_path],
            capture_output=True,
            text=True,
            timeout=30,
        )
        return proc.stdout, proc.stderr, proc.returncode
    finally:
        # Remove the temporary script whether or not the run succeeded.
        os.unlink(script_path)
def main():
    """Run every reproducer and report whether UBSan flagged buffer.c.

    Each test function is called in turn.  A run counts as a hit when its
    stderr contains "pointer index expression", "overflowed" and "buffer.c".
    Timeouts and other exceptions are reported and the test is skipped.

    Returns:
        0 if at least one test produced the UBSan report, otherwise 1.
    """
    print(f"Python: {sys.version}")
    print(f"Executable: {sys.executable}")
    print()

    cases = (
        ("Interactive: long f-string via stdin", test_interactive_long_fstring),
        ("Interactive: long regular string via stdin", test_interactive_long_line),
        ("Interactive: nested f-string via stdin", test_interactive_nested_fstring),
        ("File: long f-string in .py file", test_file_long_fstring),
    )
    confirmed = False

    for label, runner in cases:
        print(f"Test: {label}")
        print("-" * 60)
        try:
            out, err, status = runner()
        except subprocess.TimeoutExpired:
            print(" TIMEOUT\n")
            continue
        except Exception as exc:
            print(f" ERROR: {exc}\n")
            continue

        err_lines = err.split('\n')
        overflow_seen = "pointer index expression" in err and "overflowed" in err
        if overflow_seen and "buffer.c" in err:
            print(" BUG FOUND! Pointer overflow in buffer.c")
            # Show relevant UBSan lines
            for entry in err_lines:
                if 'buffer.c' in entry and 'runtime error' in entry:
                    print(f" {entry.strip()}")
            confirmed = True
        elif err.strip():
            # Filter out normal interactive prompt noise
            leftovers = []
            for entry in err_lines:
                if not entry.strip():
                    continue
                if entry.startswith(('>>>', '...')):
                    continue
                if 'Type "help"' in entry or 'Python 3' in entry:
                    continue
                leftovers.append(entry)
            if leftovers:
                summary = '; '.join(leftovers)[:300]
                print(f" stderr: {summary}")
            else:
                print(" No UBSan output")
        else:
            print(" No UBSan output (build may not have UBSan enabled)")

        if "ok" in (out or ""):
            print(" Output: ok (ran successfully despite UB)")
        print(f" Return code: {status}")
        print()

    print("=" * 60)
    if confirmed:
        # An empty entry prints a blank line, exactly like a bare print().
        report = (
            "CONFIRMED: Pointer overflow in Parser/lexer/buffer.c:30-31",
            "=" * 60,
            "",
            "Root cause:",
            " tok_mode_stack[0].start and .multi_line_start are NULL.",
            " _PyLexer_remember_fstring_buffers() computes NULL - tok->buf",
            " which is undefined behavior (pointer arithmetic on NULL).",
            " _PyLexer_restore_fstring_buffers() then does",
            " tok->buf + garbage_offset which causes pointer overflow.",
            "",
            "Affected code:",
            " Parser/lexer/buffer.c:16 mode->start_offset = mode->start - tok->buf",
            " Parser/lexer/buffer.c:17 mode->multi_line_start_offset = mode->multi_line_start - tok->buf",
            " Parser/lexer/buffer.c:30 mode->start = tok->buf + mode->start_offset",
            " Parser/lexer/buffer.c:31 mode->multi_line_start = tok->buf + mode->multi_line_start_offset",
            "",
            "Suggested fix: Add NULL checks in _PyLexer_remember_fstring_buffers(),",
            "similar to how _PyLexer_tok_reserve_buf() checks tok->start for NULL.",
        )
    else:
        report = (
            "No UBSan errors detected.",
            "=" * 60,
            "Make sure CPython is built with UBSan:",
            " ./configure --with-pydebug --with-undefined-behavior-sanitizer",
        )
    for text in report:
        print(text)

    # The exit status is 0 only when the bug was reproduced.
    return 0 if confirmed else 1
if __name__ == "__main__":
    sys.exit(main())

Output:
❯ ./python test_lexer_overflow.py
Python: 3.15.0a6+ (heads/main:eb6ebdb, Feb 12 2026, 19:38:45) [GCC 14.2.0]
Executable: /home/raminfp/cpython/python
Test: Interactive: long f-string via stdin
------------------------------------------------------------
BUG FOUND! Pointer overflow in buffer.c
>>> Parser/lexer/buffer.c:30:32: runtime error: pointer index expression with base 0x50300007f130 overflowed to 0xfffffddfffc38020
Parser/lexer/buffer.c:31:43: runtime error: pointer index expression with base 0x50300007f130 overflowed to 0xfffffddfffc38020
Output: ok (ran successfully despite UB)
Return code: 0
Test: Interactive: long regular string via stdin
------------------------------------------------------------
BUG FOUND! Pointer overflow in buffer.c
>>> >>> Parser/lexer/buffer.c:30:32: runtime error: pointer index expression with base 0x504000020c60 overflowed to 0xfffffdefffbd4b50
Parser/lexer/buffer.c:31:43: runtime error: pointer index expression with base 0x504000020c60 overflowed to 0xfffffdefffbd4b50
Output: ok (ran successfully despite UB)
Return code: 0

Suggested fix:
Parser/lexer/buffer.c
13
14 for (index = tok->tok_mode_stack_index; index >= 0; --index) {
15 mode = &(tok->tok_mode_stack[index]);
16 - mode->start_offset = mode->start - tok->buf;
17 - mode->multi_line_start_offset = mode->multi_line_start - tok->buf;
16 + mode->start_offset = mode->start == NULL ? -1 : mode->start - tok->buf;
17 + mode->multi_line_start_offset = mode->multi_line_start == NULL ? -1 : mode->multi_line_start - tok->buf;
18 }
19 }
20
...
27
28 for (index = tok->tok_mode_stack_index; index >= 0; --index) {
29 mode = &(tok->tok_mode_stack[index]);
30 - mode->start = tok->buf + mode->start_offset;
31 - mode->multi_line_start = tok->buf + mode->multi_line_start_offset;
30 + mode->start = mode->start_offset < 0 ? NULL : tok->buf + mode->start_offset;
31 + mode->multi_line_start = mode->multi_line_start_offset < 0 ? NULL : tok->buf + mode->multi_line_start_offset;
32 }
33 }
34

CPython versions tested on:
CPython main branch
Operating systems tested on:
Linux
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
type-bug: An unexpected behavior, bug, or error