To get the unique dates, use SeriesGroupBy.unique(). To count the number of unique id2 in each group, use SeriesGroupBy.nunique().
temp = data[['id', 'date', 'id2']].drop_duplicates()
temp.groupby('id', as_index=False).agg({'date': 'unique', 'id2': 'nunique'})
Not dropping duplicates beforehand may be faster: 'unique' and 'nunique' already ignore repeated values within each group, so the result is the same, and pandas only has to iterate over your data once instead of twice.
data.groupby('id', as_index=False).agg({'date': 'unique', 'id2': 'nunique'})
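On pandas 0.25 or later you can also write the same aggregation with named aggregation, which lets you choose the output column names (unique_dates and n_id2 below are just illustrative names, not anything from the question):
data.groupby('id', as_index=False).agg(unique_dates=('date', 'unique'),
                                       n_id2=('id2', 'nunique'))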
EDIT:
Here are some benchmarks. Interestingly, SeriesGroupBy.unique() and SeriesGroupBy.nunique() do not seem to be faster than using sets, but not dropping duplicates beforehand is.
import io
import pandas as pd
raw = io.StringIO("""\
id id2 date value
0 33208381500016 1927637 2014-07-31 120.0
1 77874276700016 3418498 2014-11-22 10.5
2 77874276700016 1174018 2014-11-22 8.4
3 77874276700016 1174018 2014-11-20 1.4
4 77874276700016 1643839 2014-06-27 4.2
5 77874276700016 1972929 2014-06-27 6.7
6 77874276700016 1972929 2014-06-27 12.7
7 77874276700016 1588191 2014-02-20 123.4
8 77874276700016 1966627 2014-02-20 973.1
9 77874276700016 1830252 2014-02-20 0.5
""")
data = pd.read_csv(raw, delim_whitespace=True)
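On recent pandas versions, where delim_whitespace is deprecated, the equivalent spelling is:
data = pd.read_csv(raw, sep=r'\s+')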
def using_sets_drop_then_group():
    temp = data[['id', 'date', 'id2']].drop_duplicates()
    temp.groupby('id', as_index=False).agg({'date': lambda x: set(x),
                                            'id2': lambda x: len(set(x))})

def using_sets_drop_just_group():
    data.groupby('id', as_index=False).agg({'date': lambda x: set(x),
                                            'id2': lambda x: len(set(x))})

def using_unique_drop_then_group():
    temp = data[['id', 'date', 'id2']].drop_duplicates()
    temp.groupby('id', as_index=False).agg({'date': 'unique', 'id2': 'nunique'})

def using_unique_just_group():
    data.groupby('id', as_index=False).agg({'date': 'unique', 'id2': 'nunique'})
%timeit using_sets_drop_then_group() # => 100 loops, best of 3: 4.82 ms per loop
%timeit using_sets_drop_just_group() # => 100 loops, best of 3: 2.91 ms per loop
%timeit using_unique_drop_then_group() # => 100 loops, best of 3: 5.14 ms per loop
%timeit using_unique_just_group() # => 100 loops, best of 3: 3.26 ms per loop
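%timeit is an IPython magic; if you want to reproduce these numbers outside IPython or Jupyter, the standard timeit module gives a rough equivalent:
import timeit
print(timeit.timeit(using_unique_just_group, number=100))  # total seconds for 100 calls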
EDIT:
In a comment, @ptrj suggests SeriesGroupBy.unique() and SeriesGroupBy.nunique() may be faster if dates are converted to datetime64. Alas, that does not seem to be the case, at least for this small sample of data.
data['parsed_date'] = pd.to_datetime(data['date'])
def using_sets_and_datetime64():
    data.groupby('id', as_index=False).agg({'parsed_date': lambda x: set(x),
                                            'id2': lambda x: len(set(x))})

def using_unique_and_datetime64():
    data.groupby('id', as_index=False).agg({'parsed_date': 'unique',
                                            'id2': 'nunique'})
%timeit using_sets_and_datetime64() # => 100 loops, best of 3: 3.2 ms per loop
%timeit using_unique_and_datetime64() # => 100 loops, best of 3: 3.53 ms per loop
EDIT:
@MaxU's suggestion of concatenating 100,000 copies of the sample data indeed leads to SeriesGroupBy.unique() and SeriesGroupBy.nunique() outperforming set.
large_data = pd.concat([data] * 10**5, ignore_index=True)
def using_sets():
    large_data.groupby('id', as_index=False).agg({'date': lambda x: set(x),
                                                  'id2': lambda x: len(set(x))})

def using_unique():
    large_data.groupby('id', as_index=False).agg({'date': 'unique',
                                                  'id2': 'nunique'})

def using_sets_and_datetime64():
    large_data.groupby('id', as_index=False).agg({'parsed_date': lambda x: set(x),
                                                  'id2': lambda x: len(set(x))})

def using_unique_and_datetime64():
    large_data.groupby('id', as_index=False).agg({'parsed_date': 'unique',
                                                  'id2': 'nunique'})
%timeit using_sets() # => 1 loops, best of 3: 295 ms per loop
%timeit using_unique() # => 1 loops, best of 3: 327 ms per loop
%timeit using_sets_and_datetime64() # => 1 loops, best of 3: 5.02 s per loop
%timeit using_unique_and_datetime64() # => 1 loops, best of 3: 248 ms per loop
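My guess for the using_sets_and_datetime64 outlier (and it is only a guess): building a Python set forces every datetime64 value to be boxed into a pd.Timestamp object, whereas 'unique' can stay in the datetime64 array world. A rough way to see the difference on the large frame:
%timeit set(large_data['parsed_date'])       # boxes each value into a pd.Timestamp
%timeit large_data['parsed_date'].unique()   # no per-value boxing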