Skip to content

Commit 9e0f70b

Browse files
authored
refactor: fix ops.StrftimeOp, ops.ToDatetimeOp, ops.ToTimestampOp in sqlglot compiler (#2297)
This change aims to fix the `to_datetime` related tests failing in #2248. Fixes internal issue 417774347 🦕
1 parent 6cdf64b commit 9e0f70b

File tree

9 files changed

+163
-51
lines changed

9 files changed

+163
-51
lines changed

bigframes/core/compile/sqlglot/expressions/datetime_ops.py

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616

1717
import sqlglot.expressions as sge
1818

19+
from bigframes import dtypes
1920
from bigframes import operations as ops
21+
from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS
2022
from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr
2123
import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler
2224

@@ -81,22 +83,73 @@ def _(expr: TypedExpr) -> sge.Expression:
8183

8284
@register_unary_op(ops.StrftimeOp, pass_op=True)
def _(expr: TypedExpr, op: ops.StrftimeOp) -> sge.Expression:
    """Compile ``StrftimeOp`` to the ``FORMAT_*`` SQL function for the operand dtype.

    Args:
        expr: Typed operand; its ``dtype`` selects the formatting function.
        op: The strftime operation; ``op.date_format`` is passed as the format.

    Returns:
        A ``FORMAT_DATE`` / ``FORMAT_DATETIME`` / ``FORMAT_TIME`` /
        ``FORMAT_TIMESTAMP`` call taking the format string and the operand.

    Raises:
        TypeError: If the operand dtype is not DATE, DATETIME, TIME or TIMESTAMP.
    """
    # Pairs are compared with ``==`` (not hashed) to keep plain dtype equality.
    format_functions = (
        (dtypes.DATE_DTYPE, "FORMAT_DATE"),
        (dtypes.DATETIME_DTYPE, "FORMAT_DATETIME"),
        (dtypes.TIME_DTYPE, "FORMAT_TIME"),
        (dtypes.TIMESTAMP_DTYPE, "FORMAT_TIMESTAMP"),
    )
    for dtype, func_name in format_functions:
        if expr.dtype == dtype:
            return sge.func(func_name, sge.convert(op.date_format), expr.expr)

    # Fail loudly: falling through would emit a call with an empty function name.
    raise TypeError(f"StrftimeOp is not supported for dtype: {expr.dtype}")
8597

8698

8799
@register_unary_op(ops.time_op)
def _(expr: TypedExpr) -> sge.Expression:
    """Compile ``time_op`` as a SQL ``TIME(...)`` call on the operand."""
    operand = expr.expr
    return sge.func("TIME", operand)
90102

91103

92-
@register_unary_op(ops.ToDatetimeOp)
93-
def _(expr: TypedExpr) -> sge.Expression:
94-
return sge.Cast(this=sge.func("TIMESTAMP_SECONDS", expr.expr), to="DATETIME")
95-
104+
@register_unary_op(ops.ToDatetimeOp, pass_op=True)
def _(expr: TypedExpr, op: ops.ToDatetimeOp) -> sge.Expression:
    """Compile ``ToDatetimeOp`` into an expression producing a DATETIME.

    Three paths, checked in order:

    * ``op.format`` set: the operand (cast to STRING when it is not already a
      string) is parsed with ``PARSE_TIMESTAMP`` using the ``"UTC"`` argument,
      then cast to DATETIME.
    * string operand without a format: a ``TryCast`` to DATETIME.
    * anything else: the operand is multiplied by the factor looked up for
      ``op.unit`` (``"ns"`` when unset), truncated, cast to INT64 and fed to
      ``TIMESTAMP_MICROS``, then cast to DATETIME.
    """
    if op.format:
        text = (
            sge.Cast(this=expr.expr, to="STRING")
            if expr.dtype != dtypes.STRING_DTYPE
            else expr.expr
        )
        parsed = sge.func(
            "PARSE_TIMESTAMP", sge.convert(op.format), text, sge.convert("UTC")
        )
        return sge.Cast(this=parsed, to="DATETIME")

    if expr.dtype == dtypes.STRING_DTYPE:
        return sge.TryCast(this=expr.expr, to="DATETIME")

    micros = expr.expr
    scale = UNIT_TO_US_CONVERSION_FACTORS[op.unit or "ns"]
    if scale != 1:
        micros = sge.Mul(this=micros, expression=sge.convert(scale))
    # TRUNC is applied even when no scaling happened.
    whole_micros = sge.Cast(this=sge.func("TRUNC", micros), to="INT64")
    return sge.Cast(this=sge.func("TIMESTAMP_MICROS", whole_micros), to="DATETIME")
128+
129+
130+
@register_unary_op(ops.ToTimestampOp, pass_op=True)
def _(expr: TypedExpr, op: ops.ToTimestampOp) -> sge.Expression:
    """Compile ``ToTimestampOp`` into an expression producing a TIMESTAMP.

    Args:
        expr: Typed operand to convert.
        op: The conversion operation; ``op.format`` and ``op.unit`` are read.

    Returns:
        * ``op.format`` set: ``PARSE_TIMESTAMP(format, operand, "UTC")``, with
          the operand cast to STRING first when it is not already a string.
        * string operand without a format: ``TIMESTAMP(operand)``.
        * otherwise: the operand multiplied by the factor looked up for
          ``op.unit`` (``"ns"`` when unset), truncated, cast to INT64, passed
          to ``TIMESTAMP_MICROS`` and cast to TIMESTAMP.

    Raises:
        KeyError: If the unit has no entry in ``UNIT_TO_US_CONVERSION_FACTORS``.
    """
    if op.format:
        result = expr.expr
        if expr.dtype != dtypes.STRING_DTYPE:
            result = sge.Cast(this=result, to="STRING")
        # Parse the (possibly cast) ``result``; passing ``expr.expr`` here would
        # silently discard the STRING cast for non-string operands.
        return sge.func(
            "PARSE_TIMESTAMP", sge.convert(op.format), result, sge.convert("UTC")
        )

    if expr.dtype == dtypes.STRING_DTYPE:
        return sge.func("TIMESTAMP", expr.expr)

    value = expr.expr
    unit = op.unit or "ns"
    factor = UNIT_TO_US_CONVERSION_FACTORS[unit]
    if factor != 1:
        value = sge.Mul(this=value, expression=sge.convert(factor))
    # TRUNC is applied even when no scaling happened.
    value = sge.func("TRUNC", value)
    return sge.Cast(
        this=sge.func("TIMESTAMP_MICROS", sge.Cast(this=value, to="INT64")),
        to="TIMESTAMP",
    )
100153

101154

102155
@register_unary_op(ops.UnixMicros)

bigframes/core/compile/sqlglot/expressions/timedelta_ops.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import sqlglot.expressions as sge
1818

19+
from bigframes import dtypes
1920
from bigframes import operations as ops
2021
from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS
2122
from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr
@@ -32,7 +33,12 @@ def _(expr: TypedExpr) -> sge.Expression:
3233
@register_unary_op(ops.ToTimedeltaOp, pass_op=True)
def _(expr: TypedExpr, op: ops.ToTimedeltaOp) -> sge.Expression:
    """Compile ``ToTimedeltaOp`` by scaling the operand with the factor for ``op.unit``.

    An operand that already has the timedelta dtype is returned unchanged.
    Otherwise the operand is multiplied by the factor looked up in
    ``UNIT_TO_US_CONVERSION_FACTORS`` (skipped when the factor is 1), and a
    float operand is additionally floored and cast to INT64.
    """
    operand = expr.expr
    if expr.dtype == dtypes.TIMEDELTA_DTYPE:
        return operand

    scale = UNIT_TO_US_CONVERSION_FACTORS[op.unit]
    scaled = (
        sge.Mul(this=operand, expression=sge.convert(scale))
        if scale != 1
        else operand
    )
    if expr.dtype == dtypes.FLOAT_DTYPE:
        return sge.Cast(this=sge.Floor(this=scaled), to=sge.DataType(this="INT64"))
    return scaled

bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,8 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
648648
elif dtype == dtypes.BYTES_DTYPE:
649649
return _cast(str(value), sqlglot_type)
650650
elif dtypes.is_time_like(dtype):
651+
if isinstance(value, str):
652+
return _cast(sge.convert(value), sqlglot_type)
651653
if isinstance(value, np.generic):
652654
value = value.item()
653655
return _cast(sge.convert(value.isoformat()), sqlglot_type)
Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,22 @@
11
WITH `bfcte_0` AS (
22
SELECT
3+
`date_col`,
4+
`datetime_col`,
5+
`time_col`,
36
`timestamp_col`
47
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
58
), `bfcte_1` AS (
69
SELECT
710
*,
8-
FORMAT_TIMESTAMP('%Y-%m-%d', `timestamp_col`) AS `bfcol_1`
11+
FORMAT_DATE('%Y-%m-%d', `date_col`) AS `bfcol_8`,
12+
FORMAT_DATETIME('%Y-%m-%d', `datetime_col`) AS `bfcol_9`,
13+
FORMAT_TIME('%Y-%m-%d', `time_col`) AS `bfcol_10`,
14+
FORMAT_TIMESTAMP('%Y-%m-%d', `timestamp_col`) AS `bfcol_11`
915
FROM `bfcte_0`
1016
)
1117
SELECT
12-
`bfcol_1` AS `timestamp_col`
18+
`bfcol_8` AS `date_col`,
19+
`bfcol_9` AS `datetime_col`,
20+
`bfcol_10` AS `time_col`,
21+
`bfcol_11` AS `timestamp_col`
1322
FROM `bfcte_1`
Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
WITH `bfcte_0` AS (
22
SELECT
3-
`int64_col`
3+
`float64_col`,
4+
`int64_col`,
5+
`string_col`
46
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
57
), `bfcte_1` AS (
68
SELECT
79
*,
8-
CAST(TIMESTAMP_SECONDS(`int64_col`) AS DATETIME) AS `bfcol_1`
10+
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS DATETIME) AS `bfcol_6`,
11+
SAFE_CAST(`string_col` AS DATETIME) AS `bfcol_7`,
12+
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`float64_col` * 0.001) AS INT64)) AS DATETIME) AS `bfcol_8`
913
FROM `bfcte_0`
1014
)
1115
SELECT
12-
`bfcol_1` AS `int64_col`
16+
`bfcol_6` AS `int64_col`,
17+
`bfcol_7` AS `string_col`,
18+
`bfcol_8` AS `float64_col`
1319
FROM `bfcte_1`
Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,24 @@
11
WITH `bfcte_0` AS (
22
SELECT
3+
`float64_col`,
34
`int64_col`
45
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
56
), `bfcte_1` AS (
67
SELECT
78
*,
8-
TIMESTAMP_SECONDS(`int64_col`) AS `bfcol_1`
9+
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS TIMESTAMP) AS `bfcol_2`,
10+
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`float64_col` * 0.001) AS INT64)) AS TIMESTAMP) AS `bfcol_3`,
11+
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 1000000) AS INT64)) AS TIMESTAMP) AS `bfcol_4`,
12+
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 1000) AS INT64)) AS TIMESTAMP) AS `bfcol_5`,
13+
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col`) AS INT64)) AS TIMESTAMP) AS `bfcol_6`,
14+
CAST(TIMESTAMP_MICROS(CAST(TRUNC(`int64_col` * 0.001) AS INT64)) AS TIMESTAMP) AS `bfcol_7`
915
FROM `bfcte_0`
1016
)
1117
SELECT
12-
`bfcol_1` AS `int64_col`
18+
`bfcol_2` AS `int64_col`,
19+
`bfcol_3` AS `float64_col`,
20+
`bfcol_4` AS `int64_col_s`,
21+
`bfcol_5` AS `int64_col_ms`,
22+
`bfcol_6` AS `int64_col_us`,
23+
`bfcol_7` AS `int64_col_ns`
1324
FROM `bfcte_1`
Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,54 @@
11
WITH `bfcte_0` AS (
22
SELECT
3+
`float64_col`,
34
`int64_col`,
45
`rowindex`
56
FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
67
), `bfcte_1` AS (
78
SELECT
89
*,
9-
`rowindex` AS `bfcol_4`,
10-
`int64_col` AS `bfcol_5`,
11-
`int64_col` AS `bfcol_6`
10+
`rowindex` AS `bfcol_6`,
11+
`int64_col` AS `bfcol_7`,
12+
`float64_col` AS `bfcol_8`,
13+
`int64_col` AS `bfcol_9`
1214
FROM `bfcte_0`
1315
), `bfcte_2` AS (
1416
SELECT
1517
*,
16-
`bfcol_4` AS `bfcol_10`,
17-
`bfcol_5` AS `bfcol_11`,
18-
`bfcol_6` AS `bfcol_12`,
19-
`bfcol_5` * 1000000 AS `bfcol_13`
18+
`bfcol_6` AS `bfcol_14`,
19+
`bfcol_7` AS `bfcol_15`,
20+
`bfcol_8` AS `bfcol_16`,
21+
`bfcol_9` AS `bfcol_17`,
22+
CAST(FLOOR(`bfcol_8` * 1000000) AS INT64) AS `bfcol_18`
2023
FROM `bfcte_1`
2124
), `bfcte_3` AS (
2225
SELECT
2326
*,
24-
`bfcol_10` AS `bfcol_18`,
25-
`bfcol_11` AS `bfcol_19`,
26-
`bfcol_12` AS `bfcol_20`,
27-
`bfcol_13` AS `bfcol_21`,
28-
`bfcol_11` * 604800000000 AS `bfcol_22`
27+
`bfcol_14` AS `bfcol_24`,
28+
`bfcol_15` AS `bfcol_25`,
29+
`bfcol_16` AS `bfcol_26`,
30+
`bfcol_17` AS `bfcol_27`,
31+
`bfcol_18` AS `bfcol_28`,
32+
`bfcol_15` * 3600000000 AS `bfcol_29`
2933
FROM `bfcte_2`
34+
), `bfcte_4` AS (
35+
SELECT
36+
*,
37+
`bfcol_24` AS `bfcol_36`,
38+
`bfcol_25` AS `bfcol_37`,
39+
`bfcol_26` AS `bfcol_38`,
40+
`bfcol_27` AS `bfcol_39`,
41+
`bfcol_28` AS `bfcol_40`,
42+
`bfcol_29` AS `bfcol_41`,
43+
`bfcol_27` AS `bfcol_42`
44+
FROM `bfcte_3`
3045
)
3146
SELECT
32-
`bfcol_18` AS `rowindex`,
33-
`bfcol_19` AS `int64_col`,
34-
`bfcol_20` AS `duration_us`,
35-
`bfcol_21` AS `duration_s`,
36-
`bfcol_22` AS `duration_w`
37-
FROM `bfcte_3`
47+
`bfcol_36` AS `rowindex`,
48+
`bfcol_37` AS `int64_col`,
49+
`bfcol_38` AS `float64_col`,
50+
`bfcol_39` AS `duration_us`,
51+
`bfcol_40` AS `duration_s`,
52+
`bfcol_41` AS `duration_w`,
53+
`bfcol_42` AS `duration_on_duration`
54+
FROM `bfcte_4`

tests/unit/core/compile/sqlglot/expressions/test_datetime_ops.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -143,12 +143,15 @@ def test_second(scalar_types_df: bpd.DataFrame, snapshot):
143143

144144

145145
def test_strftime(scalar_types_df: bpd.DataFrame, snapshot):
    """Snapshot the SQL compiled for ``StrftimeOp`` on each temporal column."""
    bf_df = scalar_types_df[["timestamp_col", "datetime_col", "date_col", "time_col"]]
    # Output columns are emitted in this order; each gets its own op instance.
    targets = ("date_col", "datetime_col", "time_col", "timestamp_col")
    exprs = [ops.StrftimeOp("%Y-%m-%d").as_expr(name) for name in targets]

    sql = utils._apply_ops_to_sql(bf_df, exprs, list(targets))
    snapshot.assert_match(sql, "out.sql")
153156

154157

@@ -161,22 +164,26 @@ def test_time(scalar_types_df: bpd.DataFrame, snapshot):
161164

162165

163166
def test_to_datetime(scalar_types_df: bpd.DataFrame, snapshot):
    """Snapshot the SQL compiled for a default ``ToDatetimeOp`` on three columns."""
    columns = ["int64_col", "string_col", "float64_col"]
    bf_df = scalar_types_df[columns]
    # One default-configured op per column, output under the same column name.
    exprs = [ops.ToDatetimeOp().as_expr(column) for column in columns]

    sql = utils._apply_ops_to_sql(bf_df, exprs, list(columns))
    snapshot.assert_match(sql, "out.sql")
171173

172174

173175
def test_to_timestamp(scalar_types_df: bpd.DataFrame, snapshot):
    """Snapshot the SQL compiled for ``ToTimestampOp`` with default and explicit units."""
    bf_df = scalar_types_df[["int64_col", "string_col", "float64_col"]]
    # (output column, source column, ToTimestampOp keyword arguments)
    cases = (
        ("int64_col", "int64_col", {}),
        ("float64_col", "float64_col", {}),
        ("int64_col_s", "int64_col", {"unit": "s"}),
        ("int64_col_ms", "int64_col", {"unit": "ms"}),
        ("int64_col_us", "int64_col", {"unit": "us"}),
        ("int64_col_ns", "int64_col", {"unit": "ns"}),
    )
    exprs = [ops.ToTimestampOp(**kwargs).as_expr(source) for _, source, kwargs in cases]
    names = [name for name, _, _ in cases]

    sql = utils._apply_ops_to_sql(bf_df, exprs, names)
    snapshot.assert_match(sql, "out.sql")
181188

182189

tests/unit/core/compile/sqlglot/expressions/test_timedelta_ops.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@
2222

2323

2424
def test_to_timedelta(scalar_types_df: bpd.DataFrame, snapshot):
    """Snapshot the SQL of a frame extended with several ``to_timedelta`` columns."""
    bf_df = scalar_types_df[["int64_col", "float64_col"]]
    # (new column, source column, unit). Order matters: the last entry reads
    # the "duration_us" column created by the first one.
    # NOTE(review): "duration_w" is built with unit "h"; the name is kept
    # because the snapshot aliases depend on it.
    conversions = (
        ("duration_us", "int64_col", "us"),
        ("duration_s", "float64_col", "s"),
        ("duration_w", "int64_col", "h"),
        ("duration_on_duration", "duration_us", "ms"),
    )
    for target, source, unit in conversions:
        bf_df[target] = bpd.to_timedelta(bf_df[source], unit)

    snapshot.assert_match(bf_df.sql, "out.sql")
3132

0 commit comments

Comments
 (0)