0

I have survey date with 5 main variables f1_, f2_, f3_ ,f4_ and f5_ and each f*_ group variable has up to 10 sub groups, ex: f1_1 , f1_2 ,f1_3 ... or f2_1, f2_2, ... f2_10.

I would like to perform a pivot_longer to reshape my dataframe in order to do my analysis, I am R user and did it like this and I would like to know how can I achieve the same output via python, pandas.

df %>% 
  # Reshape data - to long
  pivot_longer(cols = all_of(ends_with(c("1","2","3", "4" ,"5"))), names_to = c("name", "check_id"), names_pattern = "(.*)(.)") %>% 
  # Reshape data - to wide
  pivot_wider(names_from = name) %>% 
  #unnest data
  unnest() %>% 
  # remove row if it has a NA value in both column
  filter_at(.vars = vars(one_of(c("f1_", "f2_"))),~ !is.na(.)) %>%
  # Crosstab 3 way
  tabyl(check_id, f1_ ,f2_ ) %>% 
  # add total row and col
  adorn_totals(c("row", "col" ))

Here is the desired output:

$No
 check_id Person 1 Person 2 Person 3 Person 4 Total
        1        2        0        0        0     2
        2        0        1        0        0     1
        3        0        0        1        0     1
        4        1        0        0        1     2
    Total        3        1        1        1     6

$Yes
 check_id Person 1 Person 2 Person 3 Person 4 Total
        1        5        0        0        0     5
        2        0        5        0        0     5
        3        0        1        2        0     3
        4        0        0        0        1     1
    Total        5        6        2        1    14

Python Sample data

f1_ and f2_1 with 5 sub groups

df = pd.DataFrame(
  {
    "f1_1": ["Person 1","NA","Person 1","Person 1","Person 1","Person 1","NA","Person 1", "Person 1"],
    "f1_2": ["Person 2","NA","Person 2","Person 2","Person 2","NA","NA","Person 2","Person 2"],
    "f1_3": ["Person 3","NA","NA","Person 3","Person 2","NA","NA","Person 3","NA"],
    "f1_4": ["Person 4","NA","NA","Person 4", "NA","NA","NA","Person 1","NA"],
    "f1_5": ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"],
    "f2_1": ["Yes", "NA", "Yes", "No", "Yes", "No", "NA", "Yes", "Yes"],
    "f2_2": ["Yes", "NA", "Yes", "No", "Yes", "NA", "NA", "Yes", "Yes"],
    "f2_3": ["Yes", "NA", "NA", "No", "Yes", "NA", "NA", "Yes", "NA"],
    "f2_4": ["Yes", "NA", "NA", "No", "NA", "NA", "NA", "No", "NA"],
    "f2_5": ["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"]
    
  }
)

R sample data

df <- tibble::tribble(
       ~f1_1,      ~f1_2,      ~f1_3,      ~f1_4, ~f1_5, ~f2_1, ~f2_2, ~f2_3, ~f2_4, ~f2_5,
  "Person 1", "Person 2", "Person 3", "Person 4",    NA, "Yes", "Yes", "Yes", "Yes",    NA,
          NA,         NA,         NA,         NA,    NA,    NA,    NA,    NA,    NA,    NA,
  "Person 1", "Person 2",         NA,         NA,    NA, "Yes", "Yes",    NA,    NA,    NA,
  "Person 1", "Person 2", "Person 3", "Person 4",    NA,  "No",  "No",  "No",  "No",    NA,
  "Person 1", "Person 2", "Person 2",         NA,    NA, "Yes", "Yes", "Yes",    NA,    NA,
  "Person 1",         NA,         NA,         NA,    NA,  "No",    NA,    NA,    NA,    NA,
          NA,         NA,         NA,         NA,    NA,    NA,    NA,    NA,    NA,    NA,
  "Person 1", "Person 2", "Person 3", "Person 1",    NA, "Yes", "Yes", "Yes",  "No",    NA,
  "Person 1", "Person 2",         NA,         NA,    NA, "Yes", "Yes",    NA,    NA,    NA
  )
DanG
  • 689
  • 1
  • 16
  • 39

2 Answers2

1

Let's try:

# convert columns to multi index:
df.columns = pd.MultiIndex.from_tuples(map(tuple,df.columns.str.split('_')))

(df.where(df.ne('NA')).stack()
  .set_index('f1', append=True)
  .groupby(level=(1,2))['f2']
  .value_counts()
  .unstack(['f1'],fill_value=0)
  .assign(total=lambda x: x.sum(1))
)

Output:

  f1   Person 1  Person 2  Person 3  Person 4  total
  f2                                                
1 No          2         0         0         0      2
  Yes         5         0         0         0      5
2 No          0         1         0         0      1
  Yes         0         5         0         0      5
3 No          0         0         1         0      1
  Yes         0         1         2         0      3
4 No          1         0         0         1      2
  Yes         0         0         0         1      1
Quang Hoang
  • 146,074
  • 10
  • 56
  • 74
1

You could also use pivot_longer function from pyjanitor; at the moment you have to install the latest development version from github:

 # install latest dev version
 # pip install git+https://github.com/ericmjl/pyjanitor.git

 import janitor

temp = (
    df.pivot_longer(names_to=(".value", "fs"), names_sep="_")
    .loc[lambda df: df.ne("NA").all(1)]
    .groupby(["fs", "f2", "f1"])
    .f1.agg("count")
    .unstack(fill_value=0)
    .assign(total=lambda df: df.sum(1))
    .swaplevel("f2", "fs")
)

temp

f1      Person 1  Person 2  Person 3  Person 4  total
f2  fs                                               
No  1          2         0         0         0      2
Yes 1          5         0         0         0      5
No  2          0         1         0         0      1
Yes 2          0         5         0         0      5
No  3          0         0         1         0      1
Yes 3          0         1         2         0      3
No  4          1         0         0         1      2
Yes 4          0         0         0         1      1

You can then split it into yes and no dataframes:

yes_df = temp.loc(axis=0)["Yes", :]
yes_df = yes_df.copy()
yes_df.loc[("Yes", "Total"), :] = yes_df.sum()
yes_df

f1         Person 1  Person 2  Person 3  Person 4  total
f2  fs                                                  
Yes 1           5.0       0.0       0.0       0.0    5.0
    2           0.0       5.0       0.0       0.0    5.0
    3           0.0       1.0       2.0       0.0    3.0
    4           0.0       0.0       0.0       1.0    1.0
    Total       5.0       6.0       2.0       1.0   14.0




no_df = temp.loc(axis=0)["No", :]
no_df = no_df.copy()
no_df.loc[("No", "Total"), :] = no_df.sum()
no_df

f1        Person 1  Person 2  Person 3  Person 4  total
f2 fs                                                  
No 1           2.0       0.0       0.0       0.0    2.0
   2           0.0       1.0       0.0       0.0    1.0
   3           0.0       0.0       1.0       0.0    1.0
   4           1.0       0.0       0.0       1.0    2.0
   Total       3.0       1.0       1.0       1.0    6.0

For your code in R, may I suggest some tweaks to your existing code :

df %>% 
  pivot_longer(starts_with('f'), 
               names_to = c('.value', "check_id"), 
               names_sep="_")%>%
  drop_na()%>%
  tabyl(check_id, f1, f2)%>%
  adorn_totals(c("row", "col"))

$No
 check_id Person 1 Person 2 Person 3 Person 4 Total
        1        2        0        0        0     2
        2        0        1        0        0     1
        3        0        0        1        0     1
        4        1        0        0        1     2
    Total        3        1        1        1     6

$Yes
 check_id Person 1 Person 2 Person 3 Person 4 Total
        1        5        0        0        0     5
        2        0        5        0        0     5
        3        0        1        2        0     3
        4        0        0        0        1     1
    Total        5        6        2        1    14
sammywemmy
  • 27,093
  • 4
  • 17
  • 31