I tried implementing a solution in my SAS code, but with no luck. I'm trying to add a Jaccard distance column to my dataset. I keep getting these errors: "variable name & is not valid" and "invalid value for the KEEP option". The idea is to solve a matching problem between two datasets while taking typing errors into account.
/* Sample input: each record holds two 3-character names packed side by side
   (columns 1-3 -> nom1, columns 4-6 -> nom2). */
data table_test;
    input @1 nom1 $3.
          @4 nom2 $3.;
    datalines;
abcade
vdenfr
azfefs
;
run;
/* kshingling: split a string into its overlapping k-character shingles.

   Parameters
     string  text to shingle (positional); read with SYMGET so it is taken
             as-is from the macro variable at DATA step execution time
     k=      shingle length (default 5)
     out=    output data set name (defaults to the macro's own name)

   Output data set: one row per shingle, with columns
     string  the input text, whitespace characters replaced by blanks and stripped
     ngram   the k-character substring starting at each position
   When the text is shorter than k the data set has no rows. */
%macro kshingling(string, k=5, out=&sysmacroname.);
data &out.(keep=string ngram);
    string = strip(prxchange('s#\s# #', -1, symget('string')));
    /* last position at which a full k-character shingle still fits */
    _last_start = lengthn(string) - &k. + 1;
    do _pos = 1 to _last_start;
        ngram = substr(string, _pos, &k.);
        output;
    end;
run;
%mend kshingling;
/* jaccard: compute the Jaccard coefficient between two strings from their
   2-character shingles and store it in the GLOBAL macro variable Jaccard.

   Parameters (positional)
     string1, string2   the two strings to compare

   Side effects
     - overwrites WORK data sets s1, s2 and t
     - sets global macro variable Jaccard and writes it to the log

   Requirements visible from the code below
     - string1 is used as a variable name in KEEP= and in CALL SYMPUTX, so it
       must be a valid SAS name and must be the name PROC DISTANCE gives to the
       column built from the first ID value.
     - the two strings must differ: the value is read from the 2nd row of the
       PROC DISTANCE output (firstobs=2).

   NOTE(review): per the PROC DISTANCE documentation METHOD=JACCARD appears to
   be the Jaccard *similarity* coefficient and METHOD=DJACCARD the
   dissimilarity - confirm which one is wanted before relying on the
   "Distance" label below. */
%macro jaccard
(string1
,string2
)
;
/* Make the result visible to the caller; without this CALL SYMPUT/SYMPUTX
   inside a macro that has parameters would create a local variable. */
%global Jaccard;

/* Shingle both strings (k=2), then stack them in s1. */
%kshingling(&string1.,k=2,out=s1)
%kshingling(&string2.,k=2,out=s2)
proc append base=s1 data=s2; run;

/* Count each shingle per string. */
proc freq data=s1 noprint;
tables string*ngram / out=s2;
run;

/* One row per string, one column per shingle. */
proc transpose data=s2 out=s1(drop=_name_ _label_);
by string notsorted;
var count;
id ngram;
run;

/* Shingles absent from a string become 0 instead of missing. */
proc stdize data=s1 out=s2 missing=0 reponly;
var _numeric_;
run;

proc distance data=s2 method=jaccard absent=0 out=s1;
var anominal(_numeric_);
id string;
run;

/* Row 2 (string2) x column string1 holds the pairwise value. */
data t(keep=&string1.);
set s1(firstobs=2);
run;

data _null_;
set t;
/* SYMPUTX converts the number without a log note, trims blanks and,
   with 'G', always writes to the global symbol table. */
call symputx('Jaccard',&string1.,'G');
run;

/* %PUT is resolved by the macro processor when it is encountered, so it must
   come after the RUN that executes CALL SYMPUTX; inside the step it would
   print the value from before the step ran. */
%put Distance de Jaccard = &Jaccard;
%mend;
/* Driver: compute the Jaccard value for every (nom1, nom2) pair of the input
   table and attach it as column Dist_Jacc.

   Why a macro loop: %jaccard generates complete DATA/PROC steps, so it cannot
   be invoked from inside a running DATA step. A macro call placed in a DATA
   step is expanded while the step is still being compiled - before
   CALL SYMPUT has executed - so &n1/&n2 are unresolved at that point (source
   of the "variable name & is not valid" / "invalid value for the KEEP option"
   errors). Here each row is first copied into macro variables by its own
   small step, and only then is %jaccard called.

   Parameters
     in=   input table with character columns nom1 and nom2 (default table_test)
     out=  output table: the input plus Dist_Jacc             (default Final)

   Also (re)creates WORK.Jacc (one Dist_Jacc per input row) and WORK._jacc_row.

   Limitation: nom1/nom2 are passed as macro arguments, so values containing
   commas, parentheses or macro triggers would need quoting, and %jaccard may
   impose its own restrictions on them. */
%macro jaccard_all(in=table_test, out=Final);
    %local i nobs n1 n2;
    /* %jaccard reports its result through &Jaccard; declaring it global here
       guarantees the value is visible in this macro after each call. */
    %global Jaccard;

    /* Number of rows to process (NOBS= is available at compile time). */
    data _null_;
        if 0 then set &in. nobs=_n;
        call symputx('nobs', _n, 'L');
        stop;
    run;

    %if &nobs. = 0 %then %do;
        %put NOTE: &in. has no rows, nothing to compute.;
        %return;
    %end;

    /* Start from an empty result table so reruns do not accumulate rows. */
    proc datasets lib=work nolist nowarn;
        delete Jacc;
    quit;

    %do i = 1 %to &nobs.;
        /* Pull row &i into macro variables. */
        data _null_;
            set &in.(firstobs=&i. obs=&i.);
            call symputx('n1', nom1, 'L');
            call symputx('n2', nom2, 'L');
        run;

        /* Missing unless %jaccard succeeds for this pair. */
        %let Jaccard = .;
        %jaccard(&n1.,&n2.)

        data _jacc_row;
            Dist_Jacc = &Jaccard.;
        run;
        proc append base=Jacc data=_jacc_row; run;
    %end;

    /* Row-wise merge: Jacc has exactly one row per input row, in input order. */
    data &out.;
        merge &in. Jacc;
    run;
%mend jaccard_all;

%jaccard_all()