-
Notifications
You must be signed in to change notification settings - Fork 61
feat: Allow join-free alignment of analytic expressions #1168
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4495c7f
8db8520
5ae20db
4290229
ba077ee
def5b25
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2341,7 +2341,9 @@ def join( | |
# Handle null index, which only supports row join | ||
# This is the canonical way of aligning on null index, so always allow (ignore block_identity_join) | ||
if self.index.nlevels == other.index.nlevels == 0: | ||
result = try_row_join(self, other, how=how) | ||
result = try_legacy_row_join(self, other, how=how) or try_new_row_join( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not needed now, as now emulating more recent pandas alignment logic, where the sides need to match exactly, without any view-like interpretations of filtered objects. |
||
self, other | ||
) | ||
if result is not None: | ||
return result | ||
raise bigframes.exceptions.NullIndexError( | ||
|
@@ -2354,7 +2356,9 @@ def join( | |
and (self.index.nlevels == other.index.nlevels) | ||
and (self.index.dtypes == other.index.dtypes) | ||
): | ||
result = try_row_join(self, other, how=how) | ||
result = try_legacy_row_join(self, other, how=how) or try_new_row_join( | ||
self, other | ||
) | ||
if result is not None: | ||
return result | ||
|
||
|
@@ -2693,7 +2697,35 @@ def is_uniquely_named(self: BlockIndexProperties): | |
return len(set(self.names)) == len(self.names) | ||
|
||
|
||
def try_row_join( | ||
def try_new_row_join( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto, re |
||
left: Block, right: Block | ||
) -> Optional[Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]]: | ||
join_keys = tuple( | ||
(left_id, right_id) | ||
for left_id, right_id in zip(left.index_columns, right.index_columns) | ||
) | ||
join_result = left.expr.try_row_join(right.expr, join_keys) | ||
if join_result is None: # did not succeed | ||
return None | ||
combined_expr, (get_column_left, get_column_right) = join_result | ||
# Keep the left index column, and drop the matching right column | ||
index_cols_post_join = [get_column_left[id] for id in left.index_columns] | ||
combined_expr = combined_expr.drop_columns( | ||
[get_column_right[id] for id in right.index_columns] | ||
) | ||
block = Block( | ||
combined_expr, | ||
index_columns=index_cols_post_join, | ||
column_labels=left.column_labels.append(right.column_labels), | ||
index_labels=left.index.names, | ||
) | ||
return ( | ||
block, | ||
(get_column_left, get_column_right), | ||
) | ||
|
||
|
||
def try_legacy_row_join( | ||
left: Block, | ||
right: Block, | ||
*, | ||
|
@@ -2727,7 +2759,7 @@ def try_row_join( | |
) | ||
for id in right.value_columns | ||
] | ||
combined_expr = left_expr.try_align_as_projection( | ||
combined_expr = left_expr.try_legacy_row_join( | ||
right_expr, | ||
join_type=how, | ||
join_keys=join_keys, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this make the column IDs less deterministic than the previous logic?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not any less deterministic than we are now. If we want an isomorphism between query structure and output syntax, there is a bit more work that needs to be done to the system, which I think basically amounts to late binding identifiers serially through the tree.