Conversation
- remove inactive users - add myself
bigframes/core/indexes/base.py
Outdated
| def to_frame( | ||
| self, index: bool = True, name: blocks.Label | None = None | ||
| ) -> bigframes.dataframe.DataFrame: | ||
| import numpy as np | ||
|
|
||
| provided_name = name if name else self.name | ||
| provided_index = self.values if index else np.arange(len(self.values)) | ||
| result = bigframes.dataframe.DataFrame( | ||
| {provided_name: self.values}, index=provided_index | ||
| ) | ||
| if index: # matching pandas behavior | ||
| result.index.name = self.name | ||
|
|
||
| return result |
bigframes/core/indexes/base.py
Outdated
| import numpy as np | ||
| provided_name = name if name else self.name | ||
| provided_index = self.values if index else np.arange(len(self.values)) | ||
| result = bigframes.dataframe.DataFrame({provided_name: self.values}, index= provided_index) |
There was a problem hiding this comment.
If the index is a MultiIndex, the resulting DataFrame will need to have multiple columns (and names will need to be a list-like with length equal to the multi-index level depth).
bigframes/core/indexes/base.py
Outdated
| ) -> bigframes.dataframe.DataFrame: | ||
| import numpy as np | ||
| provided_name = name if name else self.name | ||
| provided_index = self.values if index else np.arange(len(self.values)) |
There was a problem hiding this comment.
Just provide None if want default sequential index. np.arrange may create a very large in-memory array here if the original index is large (some BigFrames objects have billions of rows).
bigframes/core/indexes/base.py
Outdated
| def to_frame( | ||
| self, index: bool = True, name: blocks.Label | None = None | ||
| ) -> bigframes.dataframe.DataFrame: | ||
| import numpy as np |
There was a problem hiding this comment.
Put the import at the top of the file for easier management. Actually numpy is already imported in the file.
Import in the method only to avoid circular imports.
bigframes/core/indexes/base.py
Outdated
| index._linked_frame = frame | ||
| return index | ||
|
|
||
| def to_frame(self, index: bool = True, name=None) -> bigframes.dataframe.DataFrame: |
There was a problem hiding this comment.
Can we add back type hint for "name"? Why it is removed?
There was a problem hiding this comment.
added back, still getting familiar with static type checker in Python
bigframes/core/indexes/base.py
Outdated
|
|
||
| multi = isinstance(self, MultiIndex) | ||
|
|
||
| if multi: |
There was a problem hiding this comment.
In stead of checking type and do everything in base class, should override the method in derived class(MultiIndex).
bigframes/core/indexes/base.py
Outdated
|
|
||
| if multi: | ||
| columns = [ | ||
| [self.values[j][i] for j in range(len(self.values))] |
There was a problem hiding this comment.
self.values calls to_numpy, which downloads everything and not scalable. Maybe take a look at Index.from_frame() and Series.to_frame()
There was a problem hiding this comment.
fixed for Index. per offline conversation with @TrevorBergeron, efficient MultiIndex implementation is blocked by bugs in DataFrame construction with MultiIndex
bug TL;DR
- Index with array values being created for DataFrame instead of MultiIndex
bf_idx = indexes.MultiIndex.from_arrays([["a", "b", "c"], ["d", "e", "f"]])
test = bigframes.dataframe.DataFrame({1: [2, 4, 6], 2: [1, 3, 5]}, index = bf_idx)
print(test)
- block joining for DataFrame creation using MultiIndex and Dict of Series data throws
bf_idx = indexes.MultiIndex.from_arrays([["a", "b", "c"], ["d", "e", "f"]])
nlevels = bf_idx.nlevels
columns : list[bigframes.series.Series] = []
for level in range(nlevels):
series = self.get_level_values(level).to_series()
columns.append(series)
data = {i: column for i, column in enumerate(columns)}
result = bigframes.dataframe.DataFrame(
data, index= bf_idx
)
tests/system/small/test_index.py
Outdated
| ) | ||
| bf_idx = indexes.Index(["Ant", "Bear", "Cow"], name="animal") | ||
|
|
||
| for index_arg, name_arg in itertools.product( |
There was a problem hiding this comment.
you can use @pytest.mark.parametrize to achieve combinations of params.
Which creates multiple separate tests. Easier to find the problem if anyone fails.
| ) | ||
| bf_idx = indexes.Index(["Ant", "Bear", "Cow"], name="animal") | ||
|
|
||
| if name_arg is None: |
There was a problem hiding this comment.
It doesn't need the if else condition. Just
pd_df = pd_idx.to_frame(index=index_arg, name=name_arg)
bf_df = bf_idx.to_frame(index=index_arg, name=name_arg)
is good enough
There was a problem hiding this comment.
Pandas implementation of name arg handling is slightly different. If we set name=None for pandas, it will create a DataFrame with column names as string None, since technically the default is lib.nodefault (code pointer: https://github.com/pandas-dev/pandas/blob/v2.2.2/pandas/_libs/lib.pyi#L32), whereas use None: https://github.com/pandas-dev/pandas/blob/v2.2.2/pandas/core/indexes/base.py#L1607-L1666
There was a problem hiding this comment.
I can add something similar to BigFrames if you think it is better to exactly mirror pandas functionality: I am equally happy either way
There was a problem hiding this comment.
Ah, that's a weird behavior of pandas. Thanks for pointing out. Then we shouldn't let the default to be None. But should have sth similar. @TrevorBergeron
|
|
||
| pd_idx = pandas.MultiIndex.from_arrays([["a", "b", "c"], ["d", "e", "f"]]) | ||
| bf_idx = indexes.MultiIndex.from_arrays([["a", "b", "c"], ["d", "e", "f"]]) | ||
| if name_arg is None: |
|
Migration Notice: This library is moving to the google-cloud-python monorepo soon. We closed this PR due to inactivity to ensure a clean migration. Please re-open this work in the new monorepo once the migration is complete! |
Pull request was closed
Intern Starter Task
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
can test by checking:
Fixes internal 356891401 🦕