Using any POSIX awk:
$ cat tst.awk
# Convert one "<title>:" table into nested JSON. Expected input: a title line
# ending in ":", a line of date column headers that starts with whitespace,
# then rows of "<item name> <one value per date>".
BEGIN {
    inStep = 4      # number of spaces added per nesting level
    print "{"
}

# Title line: sub() strips a trailing ":" and is only true when one was there.
sub(/:$/,"") {
    inTable = 1     # a table object is now open; END must close it
    indent = inStep
    # "%*s" with an empty string argument prints `indent` spaces.
    printf "%*s\"%s\": {\n", indent, "", jsonEsc($0)
    next
}

# While no dates are known yet, a line starting with whitespace holds the
# date column headers.
!numDates && /^[[:space:]]/ {
    numDates = split($0,dates)
    next
}

# Item row: match() locates the trailing numDates whitespace-separated values;
# everything before that match is the item name (which may contain spaces).
numDates && match($0,"[[:space:]]+([^[:space:]]+[[:space:]]*){"numDates"}$") {
    indent += inStep
    # Every item after the first starts with ",\n" to terminate the previous one.
    printf "%s%*s\"%s\": {\n", (numItems++ ? ",\n" : ""), indent, "", jsonEsc(substr($0,1,RSTART-1))

    indent += inStep
    # Keep only the values so that field $i lines up with dates[i].
    $0 = substr($0,RSTART,RLENGTH)
    for ( i=1; i<=numDates; i++ ) {
        printf "%*s\"%s\": \"%s\"%s\n", indent, "", jsonEsc(dates[i]), jsonEsc($i), (i<numDates ? "," : "")
    }

    indent -= inStep
    # No trailing newline here: the next item or the END block supplies it.
    printf "%*s}", indent, ""

    indent -= inStep
}

END {
    # Only close the table object if a title line opened one; otherwise empty
    # input would produce an unbalanced extra "}".
    if ( inTable ) {
        printf "\n%*s}\n", indent, ""
    }
    print "}"
}

# Return str with each double quote and backslash preceded by a backslash so
# it can be placed inside a JSON string literal. Other characters are left
# as-is.
function jsonEsc(str) {
    gsub(/["\\]/,"\\\\&",str)
    return str
}
$ awk -f tst.awk file
{
    "Balance_sheet for AAPL": {
        "Treasury Shares Number": {
            "2023-09-30": "0.0",
            "2022-09-30": "NaN",
            "2021-09-30": "NaN",
            "2020-09-30": "NaN"
        },
        "Ordinary Shares Number": {
            "2023-09-30": "15550061000.0",
            "2022-09-30": "15943425000.0",
            "2021-09-30": "16426786000.0",
            "2020-09-30": "16976763000.0"
        }
    }
}
If you need to handle multiple "Balance Sheet" blocks then just add this:
# Not the first table: close the previous table's object, followed by a comma.
if ( numTables++ ) {
    printf "\n%*s},\n", indent, ""
}
# Start the new table with no known date headers and no items printed yet.
numDates = numItems = 0
immediately below the sub() line (i.e. as the first statements inside the `sub(/:$/,"") { ... }` action), e.g. given this input:
$ cat file2
Balance_sheet for AAPL:
                           2023-09-30     2022-09-30     2021-09-30     2020-09-30
Treasury Shares Number            0.0            NaN            NaN            NaN
Ordinary Shares Number  15550061000.0  15943425000.0  16426786000.0  16976763000.0
Balance_sheet for foo:
                           2023-09-30     2022-09-30     2021-09-30     2020-09-30
Treasury Shares Number            0.0            NaN            NaN            NaN
Ordinary Shares Number  15550061000.0  15943425000.0  16426786000.0  16976763000.0
this script:
$ cat tst.awk
# Convert one or more "<title>:" tables into nested JSON. Expected input per
# table: a title line ending in ":", a line of date column headers that starts
# with whitespace, then rows of "<item name> <one value per date>".
BEGIN {
    inStep = 4      # number of spaces added per nesting level
    print "{"
}

# Title line: sub() strips a trailing ":" and is only true when one was there.
sub(/:$/,"") {
    # Not the first table: close the previous table's object, followed by a comma.
    if ( numTables++ ) {
        printf "\n%*s},\n", indent, ""
    }
    # Start the new table with no known date headers and no items printed yet.
    numDates = numItems = 0

    indent = inStep
    # "%*s" with an empty string argument prints `indent` spaces.
    printf "%*s\"%s\": {\n", indent, "", jsonEsc($0)
    next
}

# While the current table's dates are not known yet, a line starting with
# whitespace holds the date column headers.
!numDates && /^[[:space:]]/ {
    numDates = split($0,dates)
    next
}

# Item row: match() locates the trailing numDates whitespace-separated values;
# everything before that match is the item name (which may contain spaces).
numDates && match($0,"[[:space:]]+([^[:space:]]+[[:space:]]*){"numDates"}$") {
    indent += inStep
    # Every item after the first starts with ",\n" to terminate the previous one.
    printf "%s%*s\"%s\": {\n", (numItems++ ? ",\n" : ""), indent, "", jsonEsc(substr($0,1,RSTART-1))

    indent += inStep
    # Keep only the values so that field $i lines up with dates[i].
    $0 = substr($0,RSTART,RLENGTH)
    for ( i=1; i<=numDates; i++ ) {
        printf "%*s\"%s\": \"%s\"%s\n", indent, "", jsonEsc(dates[i]), jsonEsc($i), (i<numDates ? "," : "")
    }

    indent -= inStep
    # No trailing newline here: the next item, the next title or the END block
    # supplies it.
    printf "%*s}", indent, ""

    indent -= inStep
}

END {
    # Only close a table object if at least one title line opened one;
    # otherwise empty input would produce an unbalanced extra "}".
    if ( numTables ) {
        printf "\n%*s}\n", indent, ""
    }
    print "}"
}

# Return str with each double quote and backslash preceded by a backslash so
# it can be placed inside a JSON string literal. Other characters are left
# as-is.
function jsonEsc(str) {
    gsub(/["\\]/,"\\\\&",str)
    return str
}
will produce this output:
$ awk -f tst.awk file2
{
    "Balance_sheet for AAPL": {
        "Treasury Shares Number": {
            "2023-09-30": "0.0",
            "2022-09-30": "NaN",
            "2021-09-30": "NaN",
            "2020-09-30": "NaN"
        },
        "Ordinary Shares Number": {
            "2023-09-30": "15550061000.0",
            "2022-09-30": "15943425000.0",
            "2021-09-30": "16426786000.0",
            "2020-09-30": "16976763000.0"
        }
    },
    "Balance_sheet for foo": {
        "Treasury Shares Number": {
            "2023-09-30": "0.0",
            "2022-09-30": "NaN",
            "2021-09-30": "NaN",
            "2020-09-30": "NaN"
        },
        "Ordinary Shares Number": {
            "2023-09-30": "15550061000.0",
            "2022-09-30": "15943425000.0",
            "2021-09-30": "16426786000.0",
            "2020-09-30": "16976763000.0"
        }
    }
}