Production Ready Macros for SAS Application Developers
https://github.com/sasjs/core
mp_guesspk.sas
Go to the documentation of this file.
1 /**
2  @file mp_guesspk.sas
3  @brief Guess the primary key of a table
4  @details Tries to guess the primary key of a table based on the following logic:
5 
6  * Columns with nulls are ignored
7  * Return only column combinations that provide unique results
8  * Start from one column, then move out to include composite keys of 2 to 6 columns
9 
10  The library of the target should be assigned before using this macro.
11 
12  Usage:
13 
14  filename mc url "https://raw.githubusercontent.com/sasjs/core/main/all.sas";
15  %inc mc;
16  %mp_guesspk(sashelp.class,outds=classpks)
17 
18  @param baseds The dataset to analyse
19  @param outds= The output dataset to contain the possible PKs
20  @param max_guesses= The total number of possible primary keys to generate. A
21  table is likely to have multiple unlikely PKs, so no need to list them all. Default=3.
22  @param min_rows= The minimum number of rows a table should have in order to try
23  and guess the PK. Default=5.
24 
25  <h4> Dependencies </h4>
26  @li mf_getvarlist.sas
27  @li mf_getuniquename.sas
28  @li mf_nobs.sas
29 
30  @version 9.3
31  @author Allan Bowe
32 
33 **/
34 
35 %macro mp_guesspk(baseds
36  ,outds=mp_guesspk
37  ,max_guesses=3
38  ,min_rows=5
39 )/*/STORE SOURCE*/;
40 
41  /* declare local vars */
42  %local var vars vcnt i j k l tmpvar tmpds rows posspks ppkcnt;
43  %let vars=%mf_getvarlist(&baseds);
44  %let vcnt=%sysfunc(countw(&vars));
45 
46  %if &vcnt=0 %then %do;
47  %put &sysmacroname: &baseds has no variables! Exiting.;
48  %return;
49  %end;
50 
51  /* get null count and row count */
52  %let tmpvar=%mf_getuniquename();
53  proc sql noprint;
54  create table _data_ as select
55  count(*) as &tmpvar
56  %do i=1 %to &vcnt;
57  %let var=%scan(&vars,&i);
58  ,sum(case when &var is missing then 1 else 0 end) as &var
59  %end;
60  from &baseds;
61 
62  /* transpose table and scan for not null cols */
63  proc transpose;
64  data _null_;
65  set &syslast end=last;
66  length vars $32767;
67  retain vars ;
68  if _name_="&tmpvar" then call symputx('rows',col1,'l');
69  else if col1=0 then vars=catx(' ',vars,_name_);
70  if last then call symputx('posspks',vars,'l');
71  run;
72 
73  %let ppkcnt=%sysfunc(countw(&posspks));
74  %if &ppkcnt=0 %then %do;
75  %put &sysmacroname: &baseds has no non-missing variables! Exiting.;
76  %return;
77  %end;
78 
79  proc sort data=&baseds(keep=&posspks) out=_data_ noduprec;
80  by _all_;
81  run;
82  %local pkds; %let pkds=&syslast;
83 
84  %if &rows > %mf_nobs(&pkds) %then %do;
85  %put &sysmacroname: &baseds has no combination of unique records! Exiting.;
86  %return;
87  %end;
88 
89  /* now check cardinality */
90  proc sql noprint;
91  create table _data_ as select
92  %do i=1 %to &ppkcnt;
93  %let var=%scan(&posspks,&i);
94  count(distinct &var) as &var
95  %if &i<&ppkcnt %then ,;
96  %end;
97  from &pkds;
98 
99  /* transpose and sort by cardinality */
100  proc transpose;
101  proc sort; by descending col1;
102  run;
103 
104  /* create initial PK list and re-order posspks list */
105  data &outds(keep=pkguesses);
106  length pkguesses $5000 vars $5000;
107  set &syslast end=last;
108  retain vars ;
109  vars=catx(' ',vars,_name_);
110  if col1=&rows then do;
111  pkguesses=_name_;
112  output;
113  end;
114  if last then call symputx('posspks',vars,'l');
115  run;
116 
117  %if %mf_nobs(&outds) ge &max_guesses %then %do;
118  %put &sysmacroname: %mf_nobs(&outds) possible primary key values found;
119  %return;
120  %end;
121 
122  %if &ppkcnt=1 %then %do;
123  %put &sysmacroname: No more PK guess possible;
124  %return;
125  %end;
126 
127  /* begin scanning for uniques on pairs of PKs */
128  %let tmpds=%mf_getuniquename();
129  %local lev1 lev2;
130  %do i=1 %to &ppkcnt;
131  %let lev1=%scan(&posspks,&i);
132  %do j=2 %to &ppkcnt;
133  %let lev2=%scan(&posspks,&j);
134  %if &lev1 ne &lev2 %then %do;
135  /* check for two level uniqueness */
136  proc sort data=&pkds(keep=&lev1 &lev2) out=&tmpds noduprec;
137  by _all_;
138  run;
139  %if %mf_nobs(&tmpds)=&rows %then %do;
140  proc sql;
141  insert into &outds values("&lev1 &lev2");
142  %if %mf_nobs(&outds) ge &max_guesses %then %do;
143  %put &sysmacroname: Max PKs reached at Level 2 for &baseds;
144  %return;
145  %end;
146  %end;
147  %end;
148  %end;
149  %end;
150 
151  %if &ppkcnt=2 %then %do;
152  %put &sysmacroname: No more PK guess possible;
153  %return;
154  %end;
155 
156  /* begin scanning for uniques on PK triplets */
157  %local lev3;
158  %do i=1 %to &ppkcnt;
159  %let lev1=%scan(&posspks,&i);
160  %do j=2 %to &ppkcnt;
161  %let lev2=%scan(&posspks,&j);
162  %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
163  %let lev3=%scan(&posspks,&k);
164  %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do;
165  /* check for three level uniqueness */
166  proc sort data=&pkds(keep=&lev1 &lev2 &lev3) out=&tmpds noduprec;
167  by _all_;
168  run;
169  %if %mf_nobs(&tmpds)=&rows %then %do;
170  proc sql;
171  insert into &outds values("&lev1 &lev2 &lev3");
172  %if %mf_nobs(&outds) ge &max_guesses %then %do;
173  %put &sysmacroname: Max PKs reached at Level 3 for &baseds;
174  %return;
175  %end;
176  %end;
177  %end;
178  %end;
179  %end;
180  %end;
181 
182  %if &ppkcnt=3 %then %do;
183  %put &sysmacroname: No more PK guess possible;
184  %return;
185  %end;
186 
187  /* scan for uniques on up to 4 PK fields */
188  %local lev4;
189  %do i=1 %to &ppkcnt;
190  %let lev1=%scan(&posspks,&i);
191  %do j=2 %to &ppkcnt;
192  %let lev2=%scan(&posspks,&j);
193  %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
194  %let lev3=%scan(&posspks,&k);
195  %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
196  %let lev4=%scan(&posspks,&l);
197  %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then %do;
198  /* check for four level uniqueness */
199  proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4) out=&tmpds noduprec;
200  by _all_;
201  run;
202  %if %mf_nobs(&tmpds)=&rows %then %do;
203  proc sql;
204  insert into &outds values("&lev1 &lev2 &lev3 &lev4");
205  %if %mf_nobs(&outds) ge &max_guesses %then %do;
206  %put &sysmacroname: Max PKs reached at Level 4 for &baseds;
207  %return;
208  %end;
209  %end;
210  %end;
211  %end;
212  %end;
213  %end;
214  %end;
215 
216  %if &ppkcnt=4 %then %do;
217  %put &sysmacroname: No more PK guess possible;
218  %return;
219  %end;
220 
221  /* scan for uniques on up to 4 PK fields */
222  %local lev5 m;
223  %do i=1 %to &ppkcnt;
224  %let lev1=%scan(&posspks,&i);
225  %do j=2 %to &ppkcnt;
226  %let lev2=%scan(&posspks,&j);
227  %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
228  %let lev3=%scan(&posspks,&k);
229  %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
230  %let lev4=%scan(&posspks,&l);
231  %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then
232  %do m=5 %to &ppkcnt;
233  %let lev5=%scan(&posspks,&m);
234  %if &lev1 ne &lev5 & &lev2 ne &lev5 & &lev3 ne &lev5 & &lev4 ne &lev5 %then %do;
235  /* check for four level uniqueness */
236  proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4 &lev5) out=&tmpds noduprec;
237  by _all_;
238  run;
239  %if %mf_nobs(&tmpds)=&rows %then %do;
240  proc sql;
241  insert into &outds values("&lev1 &lev2 &lev3 &lev4 &lev5");
242  %if %mf_nobs(&outds) ge &max_guesses %then %do;
243  %put &sysmacroname: Max PKs reached at Level 5 for &baseds;
244  %return;
245  %end;
246  %end;
247  %end;
248  %end;
249  %end;
250  %end;
251  %end;
252  %end;
253 
254  %if &ppkcnt=5 %then %do;
255  %put &sysmacroname: No more PK guess possible;
256  %return;
257  %end;
258 
259  /* scan for uniques on up to 4 PK fields */
260  %local lev6 n;
261  %do i=1 %to &ppkcnt;
262  %let lev1=%scan(&posspks,&i);
263  %do j=2 %to &ppkcnt;
264  %let lev2=%scan(&posspks,&j);
265  %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
266  %let lev3=%scan(&posspks,&k);
267  %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
268  %let lev4=%scan(&posspks,&l);
269  %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then
270  %do m=5 %to &ppkcnt;
271  %let lev5=%scan(&posspks,&m);
272  %if &lev1 ne &lev5 & &lev2 ne &lev5 & &lev3 ne &lev5 & &lev4 ne &lev5 %then
273  %do n=6 %to &ppkcnt;
274  %let lev6=%scan(&posspks,&n);
275  %if &lev1 ne &lev6 & &lev2 ne &lev6 & &lev3 ne &lev6
276  & &lev4 ne &lev6 & &lev5 ne &lev6 %then
277  %do;
278  /* check for four level uniqueness */
279  proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4 &lev5 &lev6)
280  out=&tmpds noduprec;
281  by _all_;
282  run;
283  %if %mf_nobs(&tmpds)=&rows %then %do;
284  proc sql;
285  insert into &outds values("&lev1 &lev2 &lev3 &lev4 &lev5 &lev6");
286  %if %mf_nobs(&outds) ge &max_guesses %then %do;
287  %put &sysmacroname: Max PKs reached at Level 6 for &baseds;
288  %return;
289  %end;
290  %end;
291  %end;
292  %end;
293  %end;
294  %end;
295  %end;
296  %end;
297  %end;
298 
299  %if &ppkcnt=6 %then %do;
300  %put &sysmacroname: No more PK guess possible;
301  %return;
302  %end;
303 
304 %mend;