6

I would like to parse Python code that contains semicolons ; for separating commands and produce code that replaces those by newlines \n. E.g., from

def main():
    a = "a;b"; return a

I'd like to produce

def main():
    a = "a;b"
    return a

Any hints?

Nico Schlömer
  • 53,797
  • 27
  • 201
  • 249

2 Answers

4

Use the tokenize library to look for token.OP tokens, where the second element is a ; *. Replace these tokens with a token.NEWLINE token.

However, you also need to adjust the token positions and generate matching indentation. After each inserted NEWLINE, the line numbers of all following tokens must be shifted (by an offset that grows by one for every NEWLINE you insert), and the columns of the tokens on the 'next' line (the remainder of the current line) must be shifted so that this new line starts at the current indentation level:

import tokenize

# Python 3's tokenize exposes a TokenInfo named tuple; where it is missing
# (Python 2) fall back to building a plain 5-tuple instead.
TokenInfo = getattr(tokenize, 'TokenInfo', lambda *a: a)  # Python 2/3 compat

def semicolon_to_newline(tokens):
    """Yield *tokens* with every ``;`` operator token turned into a NEWLINE.

    *tokens* is an iterable of 5-tuples as produced by
    ``tokenize.generate_tokens``: ``(type, string, (srow, scol),
    (erow, ecol), line)``.  The positions of the yielded tokens are
    rewritten so that the text following a semicolon lands on a new line,
    starting at the column recorded from the most recent INDENT/DEDENT
    token (column 0 if none has been seen yet, i.e. module-level code).

    The ``line`` field is passed through unchanged.

    NOTE(review): indentation is derived only from INDENT/DEDENT tokens, so
    statements that follow a one-line compound header (``if x: a; b``) get
    no special treatment -- confirm whether callers need that case.
    """
    # number of NEWLINE tokens inserted so far; added to every row number
    line_offset = 0
    # Column the current block is indented to.  Starts at 0 so that a
    # semicolon in module-level code (before any INDENT/DEDENT token) can
    # be handled; with None the subtraction below raised TypeError.
    last_indent = 0
    # None: not on a split line.  0: a semicolon was just replaced and the
    # shift for the rest of the line is still to be computed.  Any other
    # integer: number of columns to shift the remaining tokens left by.
    col_offset = None
    for ttype, tstr, (slno, scol), (elno, ecol), line in tokens:
        slno, elno = slno + line_offset, elno + line_offset
        if ttype in (tokenize.INDENT, tokenize.DEDENT):
            last_indent = ecol  # block is indented to this column
        elif ttype == tokenize.OP and tstr == ';':
            # swap out semicolon with a newline
            ttype = tokenize.NEWLINE
            tstr = '\n'
            # the offset is bumped after slno/elno were computed, so the
            # replacement NEWLINE itself stays on the current line
            line_offset += 1
            if col_offset is not None:
                # this semicolon already sits on a split-off line
                scol, ecol = scol - col_offset, ecol - col_offset
            col_offset = 0  # next tokens should start at the current indent
        elif col_offset is not None:
            if not col_offset:
                # adjust column by starting column of next token
                col_offset = scol - last_indent
            scol, ecol = scol - col_offset, ecol - col_offset
            if ttype == tokenize.NEWLINE:
                # the original line ended; later lines keep their columns
                col_offset = None
        yield TokenInfo(
            ttype, tstr, (slno, scol), (elno, ecol), line)

# Tokenize the input file, replace the semicolons, and write the source text
# regenerated from the rewritten token stream to the output file.
with open(sourcefile, 'r') as source, open(destination, 'w') as dest:
    token_stream = tokenize.generate_tokens(source.readline)
    rewritten = tokenize.untokenize(semicolon_to_newline(token_stream))
    dest.write(rewritten)

Note that I don't bother to correct the line value; it is informational only, and the data that was read from the file is not actually used when un-tokenizing.

Demo:

>>> from io import StringIO
>>> source = StringIO('''\
... def main():
...     a = "a;b"; return a
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
def main():
    a = "a;b"
    return a

and slightly more complex:

>>> source = StringIO('''\
... class Foo(object):
...     def bar(self):
...         a = 10; b = 11; c = 12
...         if self.spam:
...             x = 12; return x
...         x = 15; return y
...
...     def baz(self):
...         return self.bar;
...         # note, nothing after the semicolon
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
class Foo(object):
    def bar(self):
        a = 10
        b = 11
        c = 12
        if self.spam:
            x = 12
            return x
        x = 15
        return y

    def baz(self):
        return self.bar

        # note, nothing after the semicolon

>>> print(result.replace(' ', '.'))
class.Foo(object):
....def.bar(self):
........a.=.10
........b.=.11
........c.=.12
........if.self.spam:
............x.=.12
............return.x
........x.=.15
........return.y

....def.baz(self):
........return.self.bar
........
........#.note,.nothing.after.the.semicolon

* The Python 3 version of tokenize outputs more informative TokenInfo named tuples, which have an extra exact_type attribute that can be used instead of doing a text match: tok.exact_type == tokenize.SEMI. I kept the above compatible with Python 2 and 3 however.

Martijn Pieters
  • 1,048,767
  • 296
  • 4,058
  • 3,343
1

Here's a pyparsing solution - see comments in the code below:

from pyparsing import Literal, restOfLine, quotedString, pythonStyleComment, line

# a single literal semicolon
SEMI = Literal(';')
# a semicolon followed by whatever remains on the same line
patt = SEMI + restOfLine
# skip over quoted strings and Python comments while looking for matches
patt.ignore(quotedString)
patt.ignore(pythonStyleComment)

def split_at(s, locs):
    """
    cut s at every index listed in locs, discarding the character at each
    of those indexes; returns the resulting pieces in order (one more than
    there are cut points), each with its leading whitespace removed
    """
    cuts = list(locs)
    # every piece starts just past the previous cut (the first one at 0) ...
    starts = [0] + [cut + 1 for cut in cuts]
    # ... and stops at the next cut; None lets the last piece run to the end
    stops = cuts + [None]
    return [s[begin:end].lstrip() for begin, end in zip(starts, stops)]

def split_on_semicolon(s,l,tokens):
    """
    parse action for a match of ';' plus the rest of its line; returns the
    replacement text: '' when only whitespace follows the ';', otherwise
    the trailing statements, each on its own line and prefixed with the
    leading whitespace of the line the match was found on
    """
    # the second token contains everything after the ';'
    tail = tokens[1]
    if not tail.strip():
        # nothing but whitespace after the ';' - replace the match with ''
        return ''

    # recover the leading whitespace of the line containing the match
    full_line = line(l,s)
    stripped = full_line.lstrip()
    leading = full_line[:full_line.index(stripped)]

    # the tail may hold further ';' - collect the start offset of each
    cut_points = [start for _, start, _ in SEMI.scanString(tail)]

    # one statement per line, every line (including the first) preceded by
    # a newline and the original indent
    statements = split_at(tail, cut_points)
    return '\n' + '\n'.join(leading + stmt for stmt in statements)

# run split_on_semicolon for every match of patt
patt.addParseAction(split_on_semicolon)

# Sample input covering trailing semicolons (with and without trailing
# spaces), semicolons inside string literals, docstrings and comments,
# several statements on one line, and a run of consecutive semicolons.
sample = """
def main():
    this_semi_does_nothing();
    neither_does_this_but_there_are_spaces_afterward();   
    a = "a;b"; return a # this is a comment; it has a semicolon!

def b():
    if False:
        z=1000;b("; in quotes");  c=200;return z
    return ';'

class Foo(object):
    def bar(self):
        '''a docstring; with a semicolon'''
        a = 10; b = 11; c = 12

        # this comment; has several; semicolons
        if self.spam:
            x = 12; return x # so; does; this; one
        x = 15;;; y += x; return y

    def baz(self):
        return self.bar
"""
# print the sample as transformed by patt
print(patt.transformString(sample))

Gives:

def main():
    this_semi_does_nothing()
    neither_does_this_but_there_are_spaces_afterward()
    a = "a;b"
    return a # this is a comment; it has a semicolon!

def b():
    if False:
        z=1000
        b("; in quotes")
        c=200
        return z
    return ';'

class Foo(object):
    def bar(self):
        '''a docstring; with a semicolon'''
        a = 10
        b = 11
        c = 12

        # this comment; has several; semicolons
        if self.spam:
            x = 12
            return x # so; does; this; one
        x = 15
        y += x


        return y

    def baz(self):
        return self.bar
PaulMcG
  • 62,419
  • 16
  • 94
  • 130