Skip to content
Prev Previous commit
Next Next commit
PERF: improved perf in .to_json when lines=True
closes #14408
  • Loading branch information
jreback committed Oct 15, 2016
commit 7cad3f16bccd1d4702ef9d038b1ee0db33b9bb94
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.19.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ Highlights include:
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)

- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)



Expand Down
22 changes: 3 additions & 19 deletions pandas/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,25 +605,9 @@ def _convert_to_line_delimits(s):
if not s[0] == '[' and s[-1] == ']':
return s
s = s[1:-1]
num_open_brackets_seen = 0
commas_to_replace = []
in_quotes = False
for idx, char in enumerate(s): # iter through to find all
if char == '"' and idx > 0 and s[idx - 1] != '\\':
in_quotes = ~in_quotes
elif char == ',': # commas that should be \n
if num_open_brackets_seen == 0 and not in_quotes:
commas_to_replace.append(idx)
elif char == '{':
if not in_quotes:
num_open_brackets_seen += 1
elif char == '}':
if not in_quotes:
num_open_brackets_seen -= 1
s_arr = np.array(list(s)) # Turn to an array to set
s_arr[commas_to_replace] = '\n' # all commas at once.
s = ''.join(s_arr)
return s

from pandas.lib import convert_json_to_lines
return convert_json_to_lines(s)


def nested_to_record(ds, prefix="", level=0):
Expand Down
38 changes: 38 additions & 0 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,44 @@ def string_array_replace_from_nan_rep(
return arr


@cython.boundscheck(False)
@cython.wraparound(False)
def convert_json_to_lines(object arr):
    """
    Replace the commas that separate top-level JSON records with line feeds.

    The text is scanned byte by byte. A comma is rewritten to ``\\n`` only
    when it sits outside every ``{...}`` pair and outside every double-quoted
    string; all other bytes are left untouched.

    Parameters
    ----------
    arr : object
        Text holding comma separated JSON records, e.g. ``'{"a":1},{"a":2}'``.
        It must provide ``.encode('utf-8')``.

    Returns
    -------
    Text of the same length as the input, with each record-separating comma
    replaced by a line feed.
    """
    cdef:
        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
        bint in_quotes = False, is_escaping = False
        ndarray[uint8_t] narr
        # every sentinel is a typed C byte so the loop comparisons stay in C
        unsigned char v, comma, left_bracket, right_bracket, newline
        unsigned char quote, backslash

    newline = ord('\n')
    comma = ord(',')
    left_bracket = ord('{')
    right_bracket = ord('}')
    quote = ord('"')
    backslash = ord('\\')

    # frombuffer gives a read-only view over the encoded bytes; copy so the
    # commas can be overwritten in place.  Bytes of multi-byte UTF-8
    # sequences are all >= 0x80, so they never match the ASCII sentinels.
    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
    length = narr.shape[0]
    for i in range(length):
        v = narr[i]

        # Track escapes with a flag instead of peeking at narr[i - 1]:
        # looking back one byte mistakes the closing quote of a string that
        # ends in an escaped backslash ("foo\\") for an escaped quote, and
        # needs a special case for a quote at offset 0.
        if is_escaping:
            # v is the byte escaped by the preceding backslash; it can
            # neither toggle the quote state nor start another escape
            is_escaping = False
        elif v == backslash:
            is_escaping = True
        elif v == quote:
            in_quotes = not in_quotes

        if v == comma:  # commas that should be \n
            if num_open_brackets_seen == 0 and not in_quotes:
                narr[i] = newline
        elif v == left_bracket:
            if not in_quotes:
                num_open_brackets_seen += 1
        elif v == right_bracket:
            if not in_quotes:
                num_open_brackets_seen -= 1

    # NOTE(review): newer NumPy releases spell this ``tobytes()`` and treat
    # ``tostring()`` as a deprecated alias -- switch once the minimum
    # supported NumPy provides it.
    return narr.tostring().decode('utf-8')


@cython.boundscheck(False)
@cython.wraparound(False)
def write_csv_rows(list data, ndarray data_index,
Expand Down