1

I've been searching for a question related but I have had no luck as of yet. I am looking to transform a long list of independent variables for a regression analysis. A dummy dataset would look like this:

DATA TEST (DROP = i);
    DO i = 1 to 4000;
        VAR = i + 100000;
        output;
    end;
run;

PROC TRANSPOSE
    DATA = TEST
    OUT = TEST_T
        (DROP = _NAME_)
    PREFIX = X_;
    ID VAR;
    VAR VAR;
RUN;

DATA TEST_ARRAY;
    SET TEST_T;

    ARRAY X[*] X_:;

    DO J = 1 TO 40;
        DO I = 1 TO DIM(X);
            X[I] = RANUNI(0)*I;
            OUTPUT;
        END;
    END;
RUN;

In this case the variable names X_i are increasing monotonically, in reality, my variables are actually X_number where the number is a six digit unique identifier. I've been trying to log transform and square all these variables such that I have a new X matrix with the following columns

X_133456 X_SQ_133456 LOG_X_133456

I tried looping a list through all variables like this

PROC CONTENTS
    DATA = TEST_ARRAY
    OUT = CONTENTS;
RUN;

PROC SQL NOPRINT;
    SELECT NAME INTO: REG_FACTORS
        SEPARATED BY " "
            FROM CONTENTS;
QUIT;

DATA WANT;
SET TEST_ARRAY;
%LET index = 1;
%DO %UNTIL (%SCAN(&REG_factors.,&index.," ")=);
    %LET factors = %SCAN(&REG_factors.,&index.," ");
    LOG_X_&FACTORS. = LOG(X_&FACTORS.);
    X_SQ_&FACTORS. = (X_&FACTORS.) ** 2;
    %LET index = %EVAL(&Index + 1);
%END;
RUN;

but this blows up my server and I need to find a more efficient way of doing this, thanks in advance

EDIT: for the contributor - I managed to solve at 13:04

%LET input_factors = X_:;

PROC SQL;
    SELECT 
            NAME
        ,   TRANWRD(NAME,%SCAN(&input_factors.,1,'_'),'SQ')
        ,   TRANWRD(NAME,%SCAN(&input_factors.,1,'_'),'LOG')
    INTO        
            :factor_list        separated by " "
        ,   :sq_factor_list     separated by " "
        ,   :log_factor_list    separated by " "
    FROM
        contents
    WHERE
        VARNUM < 5
    WHERE
        NAME LIKE "%SCAN(&input_factors.,1,'_')_"
    ORDER BY
        INPUT(SCAN(NAME,-1,'_'),8.)
    ;
QUIT;

%PUT &factor_list.;
%PUT &sq_factor_list.;
%PUT &log_factor_list.;
78282219
  • 159
  • 1
  • 12
  • What is it that is having trouble? Is it generating the dataset with three times the variables? Or using the new dataset in the analysis? – Tom Nov 17 '18 at 18:31
  • Creating a dataset with 3x the number of variables – 78282219 Nov 17 '18 at 19:23

3 Answers3

0

I've gone with this, however, I think efficiency can be discussed before an answer is approved

%LET TRANSFORM_Y        = NO;
%LET TRANSFORM_X_SQ     = YES;
%LET TRANSFORM_LOG      = YES;
%MACRO TEST;
    DATA TEST (DROP = i);
        DO i = 1 to 40000;
            VAR = i + 100000;
            output;
        end;
    run;

    PROC TRANSPOSE
        DATA = TEST
        OUT = TEST_T
            (DROP = _NAME_)
        PREFIX = X_;
        ID VAR;
        VAR VAR;
    RUN;

    DATA TEST_ARRAY;
        SET TEST_T;

        ARRAY X[*] X_:;

            DO I = 1 TO DIM(X);
                X[I] = RANUNI(0)*I;
                OUTPUT;
            END;
    RUN;

    DATA TEST_ARRAY_2;
        SET TEST_ARRAY;

        Y = RANUNI(0);
        DROP I J;
        ROW_NUM = _N_;
    RUN;

    PROC TRANSPOSE
        DATA = TEST_ARRAY_2
            (DROP = ROW_NUM)
        OUT  = TEST_ARRAY_T
        ;
    RUN;
    %IF &TRANSFORM_X_SQ. = YES %THEN %DO;
        DATA TESTING_X_SQ
            (DROP = I);
            SET TEST_ARRAY_T;
            ARRAY COL[*] COL:;
                DO I = 1 TO DIM(COL);
                    COL(I) = COL(I)**2;
                END;
            Row_num = _N_;
        RUN;

        PROC TRANSPOSE
            DATA = TESTING_X_SQ
            OUT  = X_SQ_T
                (DROP = _NAME_)
            PREFIX = SQ_
            ;
            ID _NAME_
            ;
        RUN;

        DATA X_SQ_T_2;
            SET X_SQ_T;
            ROW_NUM = _N_;
        RUN;
    %END;   
    %IF &TRANSFORM_LOG. = YES %THEN %DO;
        DATA TESTING_LOG;
            SET TEST_ARRAY_T;
            ARRAY COL[*] COL:;
                DO I = 1 TO DIM(COL);
                    COL(I) = LOG(COL(I));
                END;
        RUN;

        PROC TRANSPOSE
            DATA = TESTING_LOG
            OUT  = LOG_T
            PREFIX = LOG_
            ;
            ID _NAME_
            ;
        RUN;

        DATA LOG_T_2;
            SET LOG_T;
            ROW_NUM = _N_;
        RUN;
    %END;

    PROC SQL;
        CREATE TABLE FULL_DATA AS
        SELECT
                f.*
            %IF &TRANSFORM_X_SQ.    = YES %THEN %DO;
            ,   x.*
            %END;
            %IF &TRANSFORM_LOG.     = YES %THEN %DO;
            ,   l.*
            %END;
        FROM 
            TEST_ARRAY_2    f
        %IF &TRANSFORM_X_SQ.    = YES %THEN %DO;
        LEFT JOIN
            X_SQ_T_2        x       ON f.row_num = x.row_num
        %END;
        %IF &TRANSFORM_LOG.     = YES %THEN %DO;
        LEFT JOIN
            LOG_T_2         l       ON l.row_num = x.row_num
        %END;
        ;
    QUIT;
%MEND;
%TEST;
78282219
  • 159
  • 1
  • 12
  • That's a whole lot a extraneous disk i/o when the array based solution can do the transforms in a single pass. How many `x_` variables do you actually have ? A single macro variable can hold 65,534 characters. – Richard Nov 18 '18 at 22:59
  • I only have 300 good factors, the rest of quite poor but it is good to pass them through the analysis still, i do run out of memory at times – 78282219 Nov 19 '18 at 15:25
0

Use 3 arrays, one for the input values (e.g. X_31415), and two for the new calculated values (log and square).

The trick is to dynamically generate the variable names for the calculated variables, based on the original variable names.

/* Use dictionary table to get/generate vnames */
proc sql ;
  select name, /* X_31415 */
         tranwrd(name,'X_','X_SQ_'), /* X_SQ_31415 */
         tranwrd(name,'X_','LOG_X_') /* LOG_X_31415 */
    into :VARLIST separated by ' ',
         :SQLIST separated by ' ',
         :LOGLIST separated by ' '
  from dictionary.columns
  where libname = 'WORK'
    and memname = 'MYDATA'
    and name like 'X_%'
  order by input(scan(name,-1,'_'),8.) /* order based on the numeric suffix */
  ;
quit ;

Now you can assign three arrays, looping over the input values and calculate the square and log accordingly.

data array3 ;
  set mydata ;

  array in{*} &VARLIST ; /* X_1     X_17     X_31415     X_99999     */
  array sq{*} &SQLIST ;  /* X_SQ_1  X_SQ_17  X_SQ_31415  X_SQ_99999  */
  array lg{*} &LOGLIST ; /* LOG_X_1 LOG_X_17 LOG_X_31415 LOG_X_99999 */

  do i = 1 to dim(in) ;
    sq{i} = in{i} ** 2 ;
    lg{i} = log(in{i}) ;
  end ;

  drop i ;
run ;
Chris J
  • 7,549
  • 2
  • 25
  • 25
  • This, this is good, I was trying to make 3 arrays but couldn't think of a way to create them without having the dimensions set, thanks – 78282219 Nov 18 '18 at 09:26
  • Hey man, as you can see, the input factors can have any prefix and so, I'm trying to generalise your code above, I've tried this but having no luck (see edit in post) – 78282219 Nov 18 '18 at 11:58
  • DW , i fixed it – 78282219 Nov 18 '18 at 12:04
0

When there is a vast number of variables, you may need to use SAS File I/O functions to iterate over the variables. This example creates a data set with 55,000 response variables and computes the square and log transforms of them.

%macro make_have(nvar=10);
  %local dsid suffix;

  data cols;
    do index = 100000 to 999999;
      if ranuni(123) < &nvar / (1e6-1e5) then output;
    end;
  run;

  data have;
    do id = 1 to 10;
      sex = ceil(2*ranuni(123));
      age = 17 + ceil(52*ranuni(123));
      weight = 150 + ceil(100*ranuni(123));

      %let dsid = %sysfunc(open (cols));
      %do %while (0 = %sysfunc(fetch(&dsid)));
        %let suffix = %sysfunc(getvarn(&dsid,1));
        x_&suffix = ranuni(123);
      %end;
      %let dsid = %sysfunc(close(&dsid));

      output;
    end;
  run;

%mend;

options nomprint;

%make_have(nvar=55000);

%macro make_transforms(data=, vars=, new=, function=);
  %local dsid i nvar varname;
  %let dsid = %sysfunc(open (&data));
  %do i = 1 %to %sysfunc(attrn(&dsid,nvar));
    %let varname = %sysfunc(varname(&dsid,&i));
    %if %substr(&varname,1,%length(&vars)) = &vars %then %do;
      &new.%substr(&varname,%length(&vars)+1) = %sysfunc(tranwrd(&function,#,&varname));
    %end;
  %end;
  %let dsid = %sysfunc(close(&dsid));
%mend;

data want;
  set have;
  %let t0 = %sysfunc(datetime());
  %make_transforms(data=have, vars=x_, new=x_sq_,  function=#**2)
  %make_transforms(data=have, vars=x_, new=x_log_, function=log(#))
  %put NOTE: codegen elapsed: %sysevalf(%sysfunc(datetime())-&t0);
run;
Richard
  • 25,390
  • 3
  • 25
  • 38