1

I have a dataframe like as shown below

# Sample wide-format data: one row per subject, with repeated
# (date, marks) column pairs plus two retest-mark columns whose values
# happen to duplicate the base marks.
subject_ids = [1, 2, 3, 4, 5, 6, 7, 8]
base_marks = [31, 32, 34, 45, 56, 78, 74, 32]
df = pd.DataFrame({
    'subject_ID': subject_ids,
    '1st_date': ['1/1/2020', '3/3/2000', '13/11/2020', '24/05/1998',
                 '30/03/1971', '30/03/1971', '30/03/1971', '30/03/1971'],
    '1st_marks': base_marks,
    '1st_1st_retest_marks': base_marks,
    '1st_2nd_retest_marks': base_marks,
    '2nd_date': ['1/2/2020', '3/4/2000', '13/12/2020', '24/06/1998',
                 '30/04/1971', '21/04/1971', '10/04/1971', '20/04/1971'],
    '2nd_marks': base_marks,
    '3rd_date': ['1/1/2010', '3/3/2005', '13/11/2021', '24/05/1898',
                 '30/03/1981', '30/03/1991', '30/03/1901', '30/03/1871'],
    '3rd_marks': base_marks,
})

I tried the below

# NOTE(review): the second attempt was missing its closing parenthesis
# (SyntaxError). Both attempts also rebound `df`, so even with the paren
# fixed the second melt would raise KeyError: after the first melt,
# '1st_date' is no longer a column. Keep each attempt in its own variable
# so they are independent alternatives, as intended.
melted_all = pd.melt(df, id_vars=['subject_ID'])  # incorrect output
melted_keep_first_date = pd.melt(df, id_vars=['subject_ID', '1st_date'])  # incorrect output

In my real data, I have more than 100 date columns and corresponding mark values for every subject.

How can I pass all 100 dates as input to melt function

I expect my output to be like as shown below (sample for subject_id = 1)

Please don't use any pattern from the column names, because in the real data the column names don't have any pattern like 1st, 2nd, 3rd etc.

(expected output was shown as an image in the original post)

The Great
  • 7,215
  • 7
  • 40
  • 128

2 Answers

2

If there is separator _ use it for splitting to MultiIndex, so possible use DataFrame.stack:

# Split the '_'-separated column labels into a MultiIndex and stack the
# first level, so each ordinal prefix ('1st', '2nd', ...) becomes a row.
# Only works while every column follows the <prefix>_<kind> naming scheme.
indexed = df.set_index('subject_ID')
indexed.columns = indexed.columns.str.split('_', expand=True)
reshaped = indexed.stack(0).rename_axis(['subject_ID', 'tmp']).reset_index()
prefix = reshaped.pop('tmp')
reshaped['mark_variable'] = prefix + '_marks'
reshaped['date_variable'] = prefix + '_date'
df = reshaped

print(df)
    subject_ID        date  marks mark_variable date_variable
0            1    1/1/2020     31     1st_marks      1st_date
1            1    1/2/2020     31     2nd_marks      2nd_date
2            1    1/1/2010     31     3rd_marks      3rd_date
3            2    3/3/2000     32     1st_marks      1st_date
4            2    3/4/2000     32     2nd_marks      2nd_date
5            2    3/3/2005     32     3rd_marks      3rd_date
6            3  13/11/2020     34     1st_marks      1st_date
7            3  13/12/2020     34     2nd_marks      2nd_date
8            3  13/11/2021     34     3rd_marks      3rd_date
9            4  24/05/1998     45     1st_marks      1st_date
10           4  24/06/1998     45     2nd_marks      2nd_date
11           4  24/05/1898     45     3rd_marks      3rd_date
12           5  30/03/1971     56     1st_marks      1st_date
13           5  30/04/1971     56     2nd_marks      2nd_date
14           5  30/03/1981     56     3rd_marks      3rd_date
15           6  30/03/1971     78     1st_marks      1st_date
16           6  21/04/1971     78     2nd_marks      2nd_date
17           6  30/03/1991     78     3rd_marks      3rd_date
18           7  30/03/1971     74     1st_marks      1st_date
19           7  10/04/1971     74     2nd_marks      2nd_date
20           7  30/03/1901     74     3rd_marks      3rd_date
21           8  30/03/1971     32     1st_marks      1st_date
22           8  20/04/1971     32     2nd_marks      2nd_date
23           8  30/03/1871     32     3rd_marks      3rd_date

EDIT:

# convert ID column to index first
df = df.set_index('subject_ID')

# label columns by position: the running count of 'date' columns puts each
# date column and every value column to its right into the same group
g = df.columns.str.contains('date').cumsum()

# per group, stack the value columns against the group's leading date column
d = {}
for _, group in df.groupby(g, axis=1):
    date_col = group.columns[0]
    d[date_col] = group.set_index(date_col, append=True).stack()

# rename columns
renamer = {'level_0':'date_variable','level_2':'date','level_3':'mark_variable'}

# join together dictionary, sorting by `ID` and rename columns
stacked = pd.concat(d)
stacked = stacked.sort_index(level=1, sort_remaining=False)
df = stacked.reset_index(name='mark').rename(columns=renamer)

print(df)
   date_variable  subject_ID        date         mark_variable  mark
0       1st_date           1    1/1/2020             1st_marks    31
1       1st_date           1    1/1/2020  1st_1st_retest_marks    31
2       1st_date           1    1/1/2020  1st_2nd_retest_marks    31
3       2nd_date           1    1/2/2020             2nd_marks    31
4       3rd_date           1    1/1/2010             3rd_marks    31
5       1st_date           2    3/3/2000             1st_marks    32
6       1st_date           2    3/3/2000  1st_1st_retest_marks    32
7       1st_date           2    3/3/2000  1st_2nd_retest_marks    32
8       2nd_date           2    3/4/2000             2nd_marks    32
9       3rd_date           2    3/3/2005             3rd_marks    32
10      1st_date           3  13/11/2020             1st_marks    34
11      1st_date           3  13/11/2020  1st_1st_retest_marks    34
12      1st_date           3  13/11/2020  1st_2nd_retest_marks    34
13      2nd_date           3  13/12/2020             2nd_marks    34
14      3rd_date           3  13/11/2021             3rd_marks    34
15      1st_date           4  24/05/1998             1st_marks    45
16      1st_date           4  24/05/1998  1st_1st_retest_marks    45
17      1st_date           4  24/05/1998  1st_2nd_retest_marks    45
18      2nd_date           4  24/06/1998             2nd_marks    45
19      3rd_date           4  24/05/1898             3rd_marks    45
20      1st_date           5  30/03/1971             1st_marks    56
21      1st_date           5  30/03/1971  1st_1st_retest_marks    56
22      1st_date           5  30/03/1971  1st_2nd_retest_marks    56
23      2nd_date           5  30/04/1971             2nd_marks    56
24      3rd_date           5  30/03/1981             3rd_marks    56
25      1st_date           6  30/03/1971             1st_marks    78
26      1st_date           6  30/03/1971  1st_1st_retest_marks    78
27      1st_date           6  30/03/1971  1st_2nd_retest_marks    78
28      2nd_date           6  21/04/1971             2nd_marks    78
29      3rd_date           6  30/03/1991             3rd_marks    78
30      1st_date           7  30/03/1971             1st_marks    74
31      1st_date           7  30/03/1971  1st_1st_retest_marks    74
32      1st_date           7  30/03/1971  1st_2nd_retest_marks    74
33      2nd_date           7  10/04/1971             2nd_marks    74
34      3rd_date           7  30/03/1901             3rd_marks    74
35      1st_date           8  30/03/1971             1st_marks    32
36      1st_date           8  30/03/1971  1st_1st_retest_marks    32
37      1st_date           8  30/03/1971  1st_2nd_retest_marks    32
38      2nd_date           8  20/04/1971             2nd_marks    32
39      3rd_date           8  30/03/1871             3rd_marks    32
jezrael
  • 822,522
  • 95
  • 1,334
  • 1,252
  • thanks, upvoted. But is there anyway to do without str split `_`. because, am afraid in real data there is no pattern like that... Except all date columns have a keyword `date` in them and value (marks) column doesn't have any fixed pattern as shown in example above – The Great Jul 26 '21 at 12:21
  • let this answer be there,. don't remove it. but just wanted to know whether it can be done without split char pattern, – The Great Jul 26 '21 at 12:23
  • I updated the sample dataframe and expected output. – The Great Jul 26 '21 at 12:28
  • For `retest` variables, we put `1st_date` as date variable because these columns are ordered before the `2nd_date` column. So, it takes the 1st_date as value for date variable column – The Great Jul 26 '21 at 12:30
  • @TheGreat - `1st_date` is datetime? – jezrael Jul 26 '21 at 12:32
  • In real data the `date columns` don't have the name as `1st_date`, `2nd_date` etc. some of the records just have `date` value only and some records have both ` date and timestamp` – The Great Jul 26 '21 at 12:34
  • In real data, the date columns are named as `test_date`, `measurement_date`, `exam_date`, `investigation_date` etc. There is no set pattern (except the presence of the keyword `date`). Additionally, the value (mark) column doesn't have any pattern at all. In real data, the value columns are named as `abc`, `hba1c`, `ldl`, `hdl` etc – The Great Jul 26 '21 at 12:36
  • @TheGreat - So question is how matching them? If there is no `_` ? – jezrael Jul 26 '21 at 12:44
  • 1
    By the column position...if a value column (non date column) appears after a date column, then those values (marks) belong to columns that appear in the left – The Great Jul 26 '21 at 12:46
  • 1
    @TheGreat - Understand. So there is word `date` in columns names ? – jezrael Jul 26 '21 at 12:47
  • 1
    for ex, 1st_1st_retest_marks and 1st_2nd_retest_marks should be assigned 1st date because that's the date column to the left... – The Great Jul 26 '21 at 12:47
  • for all values, we consider their immediate left date column. We don't consider date columns for the right.. for ex: for 2nd_mark, we only consider 2nd_date (because it's before (left of) the 2nd_mark column.). So, we don't consider 3rd_date for 2nd_mark because it's in right (in column ordering it appears after 2nd_mark).. we only want to attach date columns that appear before(left of) the value(mark) column – The Great Jul 26 '21 at 12:51
  • @TheGreat - Solution added. – jezrael Jul 26 '21 at 13:19
  • Thanks a lot for your help. Accepted. Will try and let you know for any questions. Thanks once again – The Great Jul 26 '21 at 13:26
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/235330/discussion-between-the-great-and-jezrael). – The Great Jul 27 '21 at 08:47
2

You could use pivot_longer from pyjanitor (they are wrappers around pandas functions), to reshape the data:

 # pip install pyjanitor
 # NOTE: pyjanitor is a third-party package; importing it registers
 # pivot_longer as a DataFrame method.
 import pandas as pd
 import janitor
 # First reshape: '*date' is a glob pattern, so subject_ID plus every
 # *date column stays fixed while the mark columns are melted.
 (df.pivot_longer(index = ['subject_ID', '*date'],
                  # any part of the column label associated 
                  # with .value stays as a column name
                  # the rest are collated under mark_variable 
                  names_to = ('mark_variable', '.value'), 
                  # this determines how the column labels are split
                  # there are two groups, to pair with the names
                  # in `names_to`
                  names_pattern=r"(.+)_(marks)$")
     # a second `melt` to get date_variable
    .pivot_longer(['subject_ID', '*mark*'], 
                  names_to = 'date_variable', 
                  values_to='date')
     # if data is large, a more efficient option here
     # would be to convert mark_variable to a categorical column
     # and then rename the categories ... pretty efficient
     # re-append the '_marks' suffix stripped by names_pattern above
    .assign(mark_variable = lambda df: df.mark_variable + "_marks")
    .sort_values(['subject_ID', 'date'], ignore_index=True)
    .head(10)
)

   subject_ID         mark_variable  marks date_variable      date
0           1             1st_marks     31      3rd_date  1/1/2010
1           1  1st_1st_retest_marks     31      3rd_date  1/1/2010
2           1  1st_2nd_retest_marks     31      3rd_date  1/1/2010
3           1             2nd_marks     31      3rd_date  1/1/2010
4           1             3rd_marks     31      3rd_date  1/1/2010
5           1             1st_marks     31      1st_date  1/1/2020
6           1  1st_1st_retest_marks     31      1st_date  1/1/2020
7           1  1st_2nd_retest_marks     31      1st_date  1/1/2020
8           1             2nd_marks     31      1st_date  1/1/2020
9           1             3rd_marks     31      1st_date  1/1/2020
sammywemmy
  • 27,093
  • 4
  • 17
  • 31