-1

I will provide the macro below, in short what it does though. It takes a libname and gets all the columns in that libname. For each column name it gets the count(*) and count(columnname) so that we can check for missing values.

Is there a more efficient way to do this? Because of our huge libnames sometimes I make over 1.000 requests to a database and I presume that's not a good thing to do.

Simply doing a select * and temporarely saving it is not an option for a table over 100 milion rows.

Below some sample output:

Tabelnaam       Kolomnaam       Aantal_regels       Aantal_gevulde_regels       Missende_regels     Percentage_gevuld
---------       ---------       -------------       ---------------------       ---------------     -----------------
FCT_sales       ID_BRON         1.000.000           900.000                     100.000             90.00%

FCT_sales       NR_SCHNOT       1.000.000           1.000.000                0                  100.00%

DIM_worker      emp_id          100.000             100.000                  0                   100.00%     
%macro database_null_check
(database= /*LIBREF voor een database, voorbeeld: DMdata*/
);

    /*Aangeven dat de macro begonnen is*/
    %put ---------------------------------------------------------------;

    %put --- Start van %upcase (&sysmacroname) macro;

    %put ---;
    
    /*Elke macro waarde naar de log schrijven*/
    %put --- Macro parameter waardes;

    %put --- database       =       &database;

    %put ---------------------------------------------------------------;
    
    /*Ophalen metadata gegevens van de library*/
    PROC SQL;
        CREATE TABLE &database AS
            SELECT 
                    (libname||'.'||memname) AS Database_tabel
                ,   memname                 AS Tabelnaam
                ,   name                    AS Kolomnaam
            FROM dictionary.columns
            WHERE libname = %UPCASE("&database");
    RUN;

    /*Toevoegen volgnummer om een dynamische do loop te maken*/
    DATA metadata;
        SET &database;
        volgnummer = _N_;
    RUN;

    PROC SQL NOPRINT;
        SELECT MAX(volgnummer) into: aantal_tabelkolommen
        FROM metadata;
    RUN;

    DATA null_controle;
        LENGTH Tabelnaam $40.;
        LENGTH Kolomnaam $40.;
        LENGTH Aantal_regels 8;
        LENGTH Aantal_gevulde_regels 8;
    RUN;

    /*Doorloopt elke kolom in de database en voert hier de acties op uit*/
    %DO i=1 %to &aantal_tabelkolommen;

        PROC SQL NOPRINT;
            SELECT Database_tabel, Tabelnaam, Kolomnaam
            INTO :database_tabel, :tabelnaam, :kolomnaam
            FROM metadata
            WHERE volgnummer = &i.;
        RUN;
        
        %put ------------------;
        %put &database_tabel;
        %put &kolomnaam;
        %put ------------------;

        PROC SQL;
            CREATE TABLE stap1_&i. AS
                SELECT 
                        "&tabelnaam"        AS Tabelnaam
                    ,   "&kolomnaam"        AS Kolomnaam
                    ,   COUNT(*)            AS Aantal_regels
                    ,   COUNT(&kolomnaam)   AS Aantal_gevulde_regels
                FROM &database_tabel;
        RUN;

        DATA stap2_&i.;
            LENGTH Tabelnaam $40.;
            LENGTH Kolomnaam $40.;
            LENGTH Aantal_regels 8;
            LENGTH Aantal_gevulde_regels 8;
            SET stap1_&i.;
        RUN;
        
        PROC APPEND 
            BASE=null_controle
            DATA=stap2_&i.;
        RUN;

        %dsdelete(ds=stap1_&i.);
        %dsdelete(ds=stap2_&i.);
  
    %END;   

    PROC SQL;
        CREATE TABLE output_null_controle AS
            SELECT 
                    Tabelnaam
                ,   Kolomnaam
                ,   aantal_regels                                                       FORMAT COMMA10.0
                ,   aantal_gevulde_regels                                               FORMAT COMMA10.0
                ,   (aantal_regels-aantal_gevulde_regels)       AS Missende_regels      FORMAT COMMA10.0
                ,   (aantal_gevulde_regels/aantal_regels)       AS Percentage_gevuld    FORMAT PERCENT10.2
            FROM null_controle
            WHERE tabelnaam IS NOT NULL;
    RUN;

    /*Opschonen van macro files*/
    %dsdelete(ds=metadata);
    %dsdelete(ds=&database);
    %dsdelete(ds=null_controle);

    /*Aangeven dat de macro klaar is*/
    %put ---------------------------------------------------------------;

    %put --- Einde van %upcase (&sysmacroname) macro;

    %put ---------------------------------------------------------------;

%mend;```

  • This has been asked and answered in previous questions. Scroll through the search results of "[sas] find missing" – Richard Apr 23 '23 at 22:29
  • Unfortunately my answer is a bit more specify than that. My current macro works fine. I am mostly asking for a way to optimize it. Currently for each row it calls the table. Even if it has called it before. Since I am using count to get my results, saving the table has no use. Nor would it be "efficient" as it would take a long time to select * each table. It's possible that this is the most efficient way, but I doubt it. – Jesper van Beemdelust Apr 24 '23 at 05:57

2 Answers2

3

The best way to make it efficient is to only process each dataset once.

You can get the number of missing for each variable easily with SQL query like this that will create a single observation.

proc sql ;
create table misscounts as 
   select sum(missing(name)) as name
        , sum(missing(sex)) as sex
        , sum(missing(age)) as age
   from sashelp.class
;
quit;

Which you can then transpose to get into one observation per variable.

proc transpose data=misscounts out=want;
run;

To generate that code you just need to get the list of of variables. You could use your SQL query or just use PROC CONTENTS instead.

proc contents data=&database.._all_ noprint out=contents; run;

Once you have the list of datasets and variables use that to generate the code.

filename code temp;
data _null_;
  file code;
  set contents;
  by libname memname ;
  if first.memname then put 
      'proc sql;'
    / 'create table misscounts as select' 
    / ' ' memname :$quote. 'as memname length=32'
  ;
  nliteral=nliteral(name);
  put ',sum(missing(' nliteral ')) as ' nliteral ;
  if last.memname then put 
      '  from ' libname +(-1) '.' memname ';'
    / 'quit;'
    / 'proc transpose data=misscounts out=membercounts(rename=(col1=count)) ;'
    / ' by memname;'
    / 'run;'
    / 'proc append base=libnamecounts data=membercounts force;'
    / 'run;'
  ;
run;
%include code / source2;
Tom
  • 47,574
  • 2
  • 16
  • 29
0

One optimizing technique for the task is to use a hash to track counts of rows and numbers of missing values in each variable of a data set.

Example:

Random variable names are used to prevent possible collisions of names in the data sets being interrogated.

%macro missingfreq(data=, out=missingfreq, base=missingfreqall);

%local vvn carray narray name count mcount end;

%let vvn = %sysfunc(getoption(validvarname));
%if &vvn ne ANY %then %do;
  options validvarname = ANY;
%end;

%local carray narray hash hiter unit name count mcount end csentinel nsentinel;
%let carray  = _%substr(%sysfunc(rand(uniform),18.16),3);
%let narray  = _%substr(%sysfunc(rand(uniform),18.16),3);
%let hash    = _%substr(%sysfunc(rand(uniform),18.16),3);
%let hiter   = _%substr(%sysfunc(rand(uniform),18.16),3);
%let unit    = _%substr(%sysfunc(rand(uniform),18.16),3);
%let libname = _%substr(%sysfunc(rand(uniform),18.16),3);
%let memname = _%substr(%sysfunc(rand(uniform),18.16),3);
%let name    = _%substr(%sysfunc(rand(uniform),18.16),3);
%let count   = _%substr(%sysfunc(rand(uniform),18.16),3);
%let mcount  = _%substr(%sysfunc(rand(uniform),18.16),3);
%let end     = _%substr(%sysfunc(rand(uniform),18.16),3);
%let csentinel = _%substr(%sysfunc(rand(uniform),18.16),3);
%let nsentinel = _%substr(%sysfunc(rand(uniform),18.16),3);

%let syslast = &data;

data &out ( 
  keep=&libname &memname &name &count &mcount
  rename=(&libname=libname &memname=memname &name=name &count=count &mcount=mcount)
);
  if 0 then set &data;

  length &csentinel $1 nsentinel 8;

  array &carray _character_;
  array &narray _numeric_;

  length &libname $8 &memname &name $32;

  &libname = "%scan(&syslast,1)";
  &memname = "%scan(&syslast,2)";

  declare hash &hash(suminc:symget('unit'));
  &hash..defineKey("&name");
  &hash..defineDone();

  &unit = 0;
  &count = 0;

  * set up zero counts;
  do _i_ = 1 to dim(&carray)-1;
    &name = vname(&carray);
    &hash..ref();
  end;
  do _i_ = 1 to dim(&narray)-1;
    &name = vname(&narray);
    &hash..ref();
  end;

  &unit = 1;

  * count missings;
  do while (not &end);
    set &data end=&end;
    &count + 1;
    do _i_ = 1 to dim(&carray)-1;
      if missing(&carray) then do;
        &name = vname(&carray);
        &hash..ref();
      end;
    end;
    do _i_ = 1 to dim(&narray)-1;
      if missing(&narray) then do;
        &name = vname(&narray);
        &hash..ref();
      end;
    end;
  end;

  declare hiter &hiter("&hash");
  do while (&hiter..next() = 0);
    &hash..sum(sum:&mcount);
    output;
  end;

  stop;
run;

proc append base=&base data=&out;
run;

%if &vvn ne ANY %then %do;
  options validvarname = &vvn;
%end;


%mend;



data work.one;
  set sashelp.class;
  if mod(_n_,13) = 0 then age = .;
  if mod(_n_,11) = 0 then name = '';
run;

data work.two work.three work.four;
  set sashelp.cars;
  if mod(_n_,13) = 0 then model = '';
  if mod(_n_,23) = 0 then msrp = .;
run;

data work.five;
  set sashelp.baseball(obs=0);
run;



proc datasets noprint lib=work;
  delete missingfreq:;
quit;

options mprint;

data _null_;
  set sashelp.vtable;
  where libname = 'WORK' and memtype='DATA';

  call execute ('%nrstr(%missingfreq) (data=' || catx('.', libname, memname) || ')');

run;
Richard
  • 25,390
  • 3
  • 25
  • 38