
Uninitialized start and multi_line_start Causing Undefined Behavior - Pointer overflow #144759


Bug report

Bug description:

CPython was built with:

./configure --with-pydebug --with-undefined-behavior-sanitizer

In the following initialization:

tok->tok_mode_stack[0] = (tokenizer_mode){
    .kind = TOK_REGULAR_MODE,
    .quote = '\0',
    .quote_size = 0,
    .in_debug = 0
};

the fields start and multi_line_start are not explicitly initialized.
PyMem_Calloc zero-initializes the allocated memory, so both pointers end up as NULL rather than garbage, but this still leads to a serious issue later.

When _PyLexer_remember_fstring_buffers executes:

mode->start_offset = mode->start - tok->buf;

we end up performing pointer arithmetic of the form NULL - valid_pointer, which is undefined behavior (UB) in C.
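
For illustration, here is a minimal standalone C program (a sketch for demonstration only, not CPython code) that reproduces the same class of problem: subtracting a valid pointer from NULL yields a garbage offset, and adding that offset back to the buffer is the pointer overflow UBSan reports. Build with, e.g., gcc -fsanitize=undefined null_offset_ub.c && ./a.out:

/* null_offset_ub.c - illustrative sketch (not CPython code) of the
 * save-offset / restore-pointer pattern going wrong when the saved
 * pointer is NULL. */
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>

int main(void) {
    char *buf = malloc(16);      /* stands in for tok->buf */
    char *start = NULL;          /* stands in for mode->start (calloc-zeroed) */

    /* "remember": pointer subtraction is only defined for pointers into the
     * same object, so NULL - buf is UB and yields a huge garbage offset. */
    ptrdiff_t start_offset = start - buf;

    /* ... in the real tokenizer the buffer is reallocated here ... */

    /* "restore": adding the garbage offset back overflows the pointer; with
     * -fsanitize=undefined this typically produces a "pointer index
     * expression ... overflowed" style report. */
    char *restored = buf + start_offset;

    printf("offset=%td restored=%p\n", start_offset, (void *)restored);
    free(buf);
    return 0;
}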

Self-contained reproducer (test_lexer_overflow.py)

import subprocess
import sys
import os
import tempfile


def test_interactive_long_fstring():
    """
    Pipe a long f-string into python -i (interactive mode).
    The interactive tokenizer uses tok_underflow_interactive which calls
    _PyLexer_tok_reserve_buf -> buffer realloc -> pointer overflow.
    """
    padding = 'A' * 200000
    code = f'x = 42\nresult = f"{{x}}{padding}"\nprint("ok")\n'

    result = subprocess.run(
        [sys.executable, '-i'],
        input=code,
        capture_output=True,
        text=True,
        timeout=30,
    )
    return result.stdout, result.stderr, result.returncode


def test_interactive_long_line():
    """
    Even a regular long string in interactive mode triggers the bug,
    since tok_mode_stack[0].start is NULL regardless of f-string usage.
    """
    padding = 'B' * 200000
    code = f'x = "{padding}"\nprint("ok")\n'

    result = subprocess.run(
        [sys.executable, '-i'],
        input=code,
        capture_output=True,
        text=True,
        timeout=30,
    )
    return result.stdout, result.stderr, result.returncode


def test_interactive_nested_fstring():
    """
    Nested f-strings increase tok_mode_stack_index, exercising more
    uninitialized entries in the mode stack.
    """
    padding = 'C' * 200000
    inner = '{f"{x}"}'
    code = f'x = 1\nresult = f"{inner}{padding}"\nprint("ok")\n'

    result = subprocess.run(
        [sys.executable, '-i'],
        input=code,
        capture_output=True,
        text=True,
        timeout=30,
    )
    return result.stdout, result.stderr, result.returncode


def test_file_long_fstring():
    """
    File-based tokenization with a long f-string.
    Uses tok_underflow_file -> tok_readline_raw -> _PyLexer_tok_reserve_buf.
    """
    padding = 'D' * 200000
    code = f'x = 42\nresult = f"{{x}}{padding}"\nprint("ok")\n'

    fd, tmpfile = tempfile.mkstemp(suffix='.py')
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(code)

        result = subprocess.run(
            [sys.executable, tmpfile],
            capture_output=True,
            text=True,
            timeout=30,
        )
        return result.stdout, result.stderr, result.returncode
    finally:
        os.unlink(tmpfile)


def main():
    print(f"Python: {sys.version}")
    print(f"Executable: {sys.executable}")
    print()

    tests = [
        ("Interactive: long f-string via stdin", test_interactive_long_fstring),
        ("Interactive: long regular string via stdin", test_interactive_long_line),
        ("Interactive: nested f-string via stdin", test_interactive_nested_fstring),
        ("File: long f-string in .py file", test_file_long_fstring),
    ]

    bug_found = False

    for name, test_func in tests:
        print(f"Test: {name}")
        print("-" * 60)

        try:
            stdout, stderr, returncode = test_func()
        except subprocess.TimeoutExpired:
            print("  TIMEOUT\n")
            continue
        except Exception as e:
            print(f"  ERROR: {e}\n")
            continue

        has_overflow = "pointer index expression" in stderr and "overflowed" in stderr
        has_buffer_c = "buffer.c" in stderr

        if has_overflow and has_buffer_c:
            print(f"  BUG FOUND! Pointer overflow in buffer.c")
            # Show relevant UBSan lines
            for line in stderr.split('\n'):
                if 'buffer.c' in line and 'runtime error' in line:
                    print(f"  {line.strip()}")
            bug_found = True
        elif stderr.strip():
            # Filter out normal interactive prompt noise
            relevant = [l for l in stderr.split('\n')
                        if l.strip() and not l.startswith('>>>') and not l.startswith('...')
                        and 'Type "help"' not in l and 'Python 3' not in l]
            if relevant:
                print(f"  stderr: {'; '.join(relevant)[:300]}")
            else:
                print(f"  No UBSan output")
        else:
            print(f"  No UBSan output (build may not have UBSan enabled)")

        if "ok" in (stdout or ""):
            print(f"  Output: ok (ran successfully despite UB)")

        print(f"  Return code: {returncode}")
        print()

    print("=" * 60)
    if bug_found:
        print("CONFIRMED: Pointer overflow in Parser/lexer/buffer.c:30-31")
        print("=" * 60)
        print()
        print("Root cause:")
        print("  tok_mode_stack[0].start and .multi_line_start are NULL.")
        print("  _PyLexer_remember_fstring_buffers() computes NULL - tok->buf")
        print("  which is undefined behavior (pointer arithmetic on NULL).")
        print("  _PyLexer_restore_fstring_buffers() then does")
        print("  tok->buf + garbage_offset which causes pointer overflow.")
        print()
        print("Affected code:")
        print("  Parser/lexer/buffer.c:16  mode->start_offset = mode->start - tok->buf")
        print("  Parser/lexer/buffer.c:17  mode->multi_line_start_offset = mode->multi_line_start - tok->buf")
        print("  Parser/lexer/buffer.c:30  mode->start = tok->buf + mode->start_offset")
        print("  Parser/lexer/buffer.c:31  mode->multi_line_start = tok->buf + mode->multi_line_start_offset")
        print()
        print("Suggested fix: Add NULL checks in _PyLexer_remember_fstring_buffers(),")
        print("similar to how _PyLexer_tok_reserve_buf() checks tok->start for NULL.")
    else:
        print("No UBSan errors detected.")
        print("=" * 60)
        print("Make sure CPython is built with UBSan:")
        print("  ./configure --with-pydebug --with-undefined-behavior-sanitizer")

    return 0 if bug_found else 1


if __name__ == "__main__":
    sys.exit(main())

Output:

❯ ./python test_lexer_overflow.py
Python: 3.15.0a6+ (heads/main:eb6ebdb, Feb 12 2026, 19:38:45) [GCC 14.2.0]
Executable: /home/raminfp/cpython/python

Test: Interactive: long f-string via stdin
------------------------------------------------------------
  BUG FOUND! Pointer overflow in buffer.c
  >>> Parser/lexer/buffer.c:30:32: runtime error: pointer index expression with base 0x50300007f130 overflowed to 0xfffffddfffc38020
  Parser/lexer/buffer.c:31:43: runtime error: pointer index expression with base 0x50300007f130 overflowed to 0xfffffddfffc38020
  Output: ok (ran successfully despite UB)
  Return code: 0

Test: Interactive: long regular string via stdin
------------------------------------------------------------
  BUG FOUND! Pointer overflow in buffer.c
  >>> >>> Parser/lexer/buffer.c:30:32: runtime error: pointer index expression with base 0x504000020c60 overflowed to 0xfffffdefffbd4b50
  Parser/lexer/buffer.c:31:43: runtime error: pointer index expression with base 0x504000020c60 overflowed to 0xfffffdefffbd4b50
  Output: ok (ran successfully despite UB)
  Return code: 0

Suggested fix:

Parser/lexer/buffer.c

      13
      14      for (index = tok->tok_mode_stack_index; index >= 0; --index) {
      15          mode = &(tok->tok_mode_stack[index]);
      16 -        mode->start_offset = mode->start - tok->buf;
      17 -        mode->multi_line_start_offset = mode->multi_line_start - tok->buf;
      16 +        mode->start_offset = mode->start == NULL ? -1 : mode->start - tok->buf;
      17 +        mode->multi_line_start_offset = mode->multi_line_start == NULL ? -1 : mode->multi_line_start - tok->buf;
      18      }
      19  }
      20
     ...
      27
      28      for (index = tok->tok_mode_stack_index; index >= 0; --index) {
      29          mode = &(tok->tok_mode_stack[index]);
      30 -        mode->start = tok->buf + mode->start_offset;
      31 -        mode->multi_line_start = tok->buf + mode->multi_line_start_offset;
      30 +        mode->start = mode->start_offset < 0 ? NULL : tok->buf + mode->start_offset;
      31 +        mode->multi_line_start = mode->multi_line_start_offset < 0 ? NULL : tok->buf + mode->multi_line_start_offset;
      32      }
      33  }
      34
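
To show the idea behind the diff in one piece, here is a self-contained sketch of the same sentinel-based save/restore pattern (mode_state, remember, and restore are simplified illustrative names, not the actual CPython identifiers). Since every valid offset into tok->buf is non-negative, -1 can safely encode "the pointer was NULL":

/* sentinel_offsets.c - illustrative sketch of the NULL/-1 sentinel pattern
 * from the suggested fix; simplified names, not the actual CPython code. */
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

typedef struct {
    const char *start;            /* may be NULL, like tok_mode_stack[0].start */
    ptrdiff_t start_offset;
} mode_state;

/* "remember": store the pointer as an offset, using -1 when it is NULL so we
 * never compute NULL - buf. */
static void remember(mode_state *m, const char *buf) {
    m->start_offset = (m->start == NULL) ? -1 : m->start - buf;
}

/* "restore": map the -1 sentinel back to NULL instead of computing
 * buf + garbage_offset. */
static void restore(mode_state *m, const char *buf) {
    m->start = (m->start_offset < 0) ? NULL : buf + m->start_offset;
}

int main(void) {
    char *buf = malloc(8);
    mode_state m = { .start = NULL, .start_offset = 0 };

    remember(&m, buf);            /* start is NULL -> offset becomes -1 */
    buf = realloc(buf, 64);       /* buffer moves, as after a tokenizer realloc */
    restore(&m, buf);             /* offset -1 -> start stays NULL, no UB */

    assert(m.start == NULL);
    free(buf);
    return 0;
}

A non-NULL start round-trips unchanged through the same pair of calls, so the fast path is unaffected.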

CPython versions tested on:

CPython main branch

Operating systems tested on:

Linux
