0

Below is the sample data set and the desired manipulations. As of yet, all works fine. Attempting to create a new calculated column. Some context, the smb stands for small business. 1,2,3,4 represent differing thresholds of what would be considered small. The desired column would what percentage of total employment is taken up by smb = 1 for a given area, for example. For area 001, this would be 46/1927 for example. I can figure out how to have it to be appear once but not as a complete column. How would I go about doing this? The desired result is at the bottom.

library(readxl)
library(dplyr)
library(data.table)
library(DBI)
library(stringr)
library(tidyverse)
library(gt)


 employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
 small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
 area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
 year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
 qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)

 smbtest <- data.frame(employment,small,area,year,qtr)

 smbtest$smb <-0

 smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))


 smbsummary2<-smbtest %>% 
 mutate(period = paste0(year,"q",qtr)) %>%
 group_by(area,period,smb) %>%
 summarise(employment = sum(employment), worksites = n(), 
        .groups = 'drop_last') %>% 
 mutate(employment = cumsum(employment),
     worksites = cumsum(worksites))

 smbsummary2<- smbsummary2%>%
 group_by(area,smb)%>%
 mutate(empprevyear=lag(employment),
     empprevyearpp=employment-empprevyear,
     empprevyearpct=((employment/empprevyear)-1), 
 empprevyearpct=scales::percent(empprevyearpct,accuracy = 0.01)
 )




smblonger2<-smbsummary2 %>%
dplyr::select(area,period,employment,worksites,smb) %>%
ungroup() %>%
pivot_longer(cols = employment:worksites, names_to = "measure", values_to = "value") %>%
group_by(area,measure) %>%
pivot_wider(names_from = period, values_from = value)%>%filter(smb %in% 
c("1","2","3","4"))%>%gt()%>%cols_label(
smb = md("**Category**"))


smblonger2

area    period   smb    employment    worksites    pcttotal
 1      2020q1    1         46           2          46/1927 (total employment)
 2      2020q2    2        301           4          301/1927
 3      2020q3    3        466           5          466/1927
 4      2020q4    4        726           6          726/1927

 schema
 smb      employment range
  1         0 to 100
  2         0 to 150
  3         0 to 250
  4         0 to 500
Tim Wilcox
  • 1,275
  • 2
  • 19
  • 43
  • I'm not at all clear what you are asking. But 1. Case when - those cases overlap if employment = 125, is that 1 or 2?. 2. Can you explain what you are trying to achieve as a result? – CALUM Polwart Apr 14 '21 at 21:31
  • @CALUMPolwart, question has been edited. – Tim Wilcox Apr 14 '21 at 21:59
  • 1
    What have you tried? %>% mutate(your_answer = employment/ sum(employment)). I'm still unclear on what it is you are asking? Are you even wanting the result as 46/1927 or are you wanting it calculated. – CALUM Polwart Apr 14 '21 at 22:10
  • @CALUMPolwart I want the calculation. I put the 46/1927 just so it is more concrete. This new column pcttotal would show you what percent of total employment is connected to smb 1 (0 to 100) for a given period and area. – Tim Wilcox Apr 15 '21 at 00:19

1 Answers1

2

OK so here is my solution (someone will now come along with a 1 line function!)

library(dplyr)
library(tidyr)

employment <- c(1,45,125,130,165,260,600,601,2,46,127,132,167,265,601,602,50,61,110,121,170,305,55,603,52,66,112,123,172,310,604,605)
small <- c(1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA,1,1,2,2,3,4,NA,NA)
area <-c(001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,001,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003,003)
year<-c(2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020,2020)
qtr <-c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2)

smbtest <- data.frame(employment,small,area,year,qtr)

smbtest$smb <-0  # I think this line is redundent

smbtest <- smbtest %>% mutate(smb = case_when(employment >=0 & employment <100 ~ "1",employment >=0 & employment <150 ~ "2",employment >=0 & employment <250 ~ "3", employment >=0 & employment <500 ~ "4"))

smbsummary2<-smbtest %>% 
    mutate(period = paste0(year,"q",qtr)) %>%
    group_by(area,period,smb) %>%
    summarise(employment = sum(employment), worksites = n(), 
              .groups = 'drop_last') %>% 
    mutate(employment = cumsum(employment),
           worksites = cumsum(worksites))



smbsummary2 %>%
    # Make the data wider (a column for each smb)
    pivot_wider(
        id_cols=c("area", "period"), 
        names_from = "smb", 
        values_from = c("employment", "worksites"),
        names_prefix = "SMB"
        ) %>%
    # calculate the %
    mutate(across(starts_with("employment_SMB"), 
                  ~(100*(.x/employment_SMBNA)),
                  .names = "pcttotal_{.col}")) %>%

    # Now make the data longer
    pivot_longer(
        cols = contains("SMB")
    ) %>%
    # rework the data names so the smb is a value
    separate(name, into=c("name", "smb"), sep="_SMB") %>%
    # Make the date wider again to match the shape requested
    pivot_wider(
        id_cols=c("area", "period", "smb"), 
        names_from = "name", 
        values_from = "value"
    ) -> smbsummary3

CALUM Polwart
  • 497
  • 3
  • 5