Production Ready Macros for SAS Application Developers
https://github.com/sasjs/core
mp_guesspk.sas
Go to the documentation of this file.
1/**
2 @file mp_guesspk.sas
3 @brief Guess the primary key of a table
4 @details Tries to guess the primary key of a table based on the following logic:
5
6 * Columns with nulls are ignored
7 * Return only column combinations that provide unique results
8 * Start from one column, then move out to include composite keys of 2 to 6 columns
9
10 The library of the target should be assigned before using this macro.
11
12 Usage:
13
14 filename mc url "https://raw.githubusercontent.com/sasjs/core/main/all.sas";
15 %inc mc;
16 %mp_guesspk(sashelp.class,outds=classpks)
17
18 @param baseds The dataset to analyse
19 @param outds= The output dataset to contain the possible PKs
20 @param max_guesses= The total number of possible primary keys to generate. A
21 table is likely to have multiple unlikely PKs, so no need to list them all. Default=3.
22 @param min_rows= The minimum number of rows a table should have in order to try
23 and guess the PK. Default=5.
24
25 <h4> SAS Macros </h4>
26 @li mf_getvarlist.sas
27 @li mf_getuniquename.sas
28 @li mf_nobs.sas
29
30 @version 9.3
31 @author Allan Bowe
32
33**/
34
35%macro mp_guesspk(baseds
36 ,outds=mp_guesspk
37 ,max_guesses=3
38 ,min_rows=5
39)/*/STORE SOURCE*/;
40
41 /* declare local vars */
42 %local var vars vcnt i j k l tmpvar tmpds rows posspks ppkcnt;
43 %let vars=%mf_getvarlist(&baseds);
44 %let vcnt=%sysfunc(countw(&vars));
45
46 %if &vcnt=0 %then %do;
47 %put &sysmacroname: &baseds has no variables! Exiting.;
48 %return;
49 %end;
50
51 /* get null count and row count */
52 %let tmpvar=%mf_getuniquename();
53 proc sql noprint;
54 create table _data_ as select
55 count(*) as &tmpvar
56 %do i=1 %to &vcnt;
57 %let var=%scan(&vars,&i);
58 ,sum(case when &var is missing then 1 else 0 end) as &var
59 %end;
60 from &baseds;
61
62 /* transpose table and scan for not null cols */
63 proc transpose;
64 data _null_;
65 set &syslast end=last;
66 length vars $32767;
67 retain vars ;
68 if _name_="&tmpvar" then call symputx('rows',col1,'l');
69 else if col1=0 then vars=catx(' ',vars,_name_);
70 if last then call symputx('posspks',vars,'l');
71 run;
72
73 %let ppkcnt=%sysfunc(countw(&posspks));
74 %if &ppkcnt=0 %then %do;
75 %put &sysmacroname: &baseds has no non-missing variables! Exiting.;
76 %return;
77 %end;
78
79 proc sort data=&baseds(keep=&posspks) out=_data_ noduprec;
80 by _all_;
81 run;
82 %local pkds; %let pkds=&syslast;
83
84 %if &rows > %mf_nobs(&pkds) %then %do;
85 %put &sysmacroname: &baseds has no combination of unique records! Exiting.;
86 %return;
87 %end;
88
89 /* now check cardinality */
90 proc sql noprint;
91 create table _data_ as select
92 %do i=1 %to &ppkcnt;
93 %let var=%scan(&posspks,&i);
94 count(distinct &var) as &var
95 %if &i<&ppkcnt %then ,;
96 %end;
97 from &pkds;
98
99 /* transpose and sort by cardinality */
100 proc transpose;
101 proc sort; by descending col1;
102 run;
103
104 /* create initial PK list and re-order posspks list */
105 data &outds(keep=pkguesses);
106 length pkguesses $5000 vars $5000;
107 set &syslast end=last;
108 retain vars ;
109 vars=catx(' ',vars,_name_);
110 if col1=&rows then do;
111 pkguesses=_name_;
112 output;
113 end;
114 if last then call symputx('posspks',vars,'l');
115 run;
116
117 %if %mf_nobs(&outds) ge &max_guesses %then %do;
118 %put &sysmacroname: %mf_nobs(&outds) possible primary key values found;
119 %return;
120 %end;
121
122 %if &ppkcnt=1 %then %do;
123 %put &sysmacroname: No more PK guess possible;
124 %return;
125 %end;
126
127 /* begin scanning for uniques on pairs of PKs */
128 %let tmpds=%mf_getuniquename();
129 %local lev1 lev2;
130 %do i=1 %to &ppkcnt;
131 %let lev1=%scan(&posspks,&i);
132 %do j=2 %to &ppkcnt;
133 %let lev2=%scan(&posspks,&j);
134 %if &lev1 ne &lev2 %then %do;
135 /* check for two level uniqueness */
136 proc sort data=&pkds(keep=&lev1 &lev2) out=&tmpds noduprec;
137 by _all_;
138 run;
139 %if %mf_nobs(&tmpds)=&rows %then %do;
140 proc sql;
141 insert into &outds values("&lev1 &lev2");
142 %if %mf_nobs(&outds) ge &max_guesses %then %do;
143 %put &sysmacroname: Max PKs reached at Level 2 for &baseds;
144 %return;
145 %end;
146 %end;
147 %end;
148 %end;
149 %end;
150
151 %if &ppkcnt=2 %then %do;
152 %put &sysmacroname: No more PK guess possible;
153 %return;
154 %end;
155
156 /* begin scanning for uniques on PK triplets */
157 %local lev3;
158 %do i=1 %to &ppkcnt;
159 %let lev1=%scan(&posspks,&i);
160 %do j=2 %to &ppkcnt;
161 %let lev2=%scan(&posspks,&j);
162 %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
163 %let lev3=%scan(&posspks,&k);
164 %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do;
165 /* check for three level uniqueness */
166 proc sort data=&pkds(keep=&lev1 &lev2 &lev3) out=&tmpds noduprec;
167 by _all_;
168 run;
169 %if %mf_nobs(&tmpds)=&rows %then %do;
170 proc sql;
171 insert into &outds values("&lev1 &lev2 &lev3");
172 %if %mf_nobs(&outds) ge &max_guesses %then %do;
173 %put &sysmacroname: Max PKs reached at Level 3 for &baseds;
174 %return;
175 %end;
176 %end;
177 %end;
178 %end;
179 %end;
180 %end;
181
182 %if &ppkcnt=3 %then %do;
183 %put &sysmacroname: No more PK guess possible;
184 %return;
185 %end;
186
187 /* scan for uniques on up to 4 PK fields */
188 %local lev4;
189 %do i=1 %to &ppkcnt;
190 %let lev1=%scan(&posspks,&i);
191 %do j=2 %to &ppkcnt;
192 %let lev2=%scan(&posspks,&j);
193 %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
194 %let lev3=%scan(&posspks,&k);
195 %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
196 %let lev4=%scan(&posspks,&l);
197 %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then %do;
198 /* check for four level uniqueness */
199 proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4) out=&tmpds noduprec;
200 by _all_;
201 run;
202 %if %mf_nobs(&tmpds)=&rows %then %do;
203 proc sql;
204 insert into &outds values("&lev1 &lev2 &lev3 &lev4");
205 %if %mf_nobs(&outds) ge &max_guesses %then %do;
206 %put &sysmacroname: Max PKs reached at Level 4 for &baseds;
207 %return;
208 %end;
209 %end;
210 %end;
211 %end;
212 %end;
213 %end;
214 %end;
215
216 %if &ppkcnt=4 %then %do;
217 %put &sysmacroname: No more PK guess possible;
218 %return;
219 %end;
220
221 /* scan for uniques on up to 4 PK fields */
222 %local lev5 m;
223 %do i=1 %to &ppkcnt;
224 %let lev1=%scan(&posspks,&i);
225 %do j=2 %to &ppkcnt;
226 %let lev2=%scan(&posspks,&j);
227 %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
228 %let lev3=%scan(&posspks,&k);
229 %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
230 %let lev4=%scan(&posspks,&l);
231 %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then
232 %do m=5 %to &ppkcnt;
233 %let lev5=%scan(&posspks,&m);
234 %if &lev1 ne &lev5 & &lev2 ne &lev5 & &lev3 ne &lev5 & &lev4 ne &lev5 %then %do;
235 /* check for four level uniqueness */
236 proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4 &lev5) out=&tmpds noduprec;
237 by _all_;
238 run;
239 %if %mf_nobs(&tmpds)=&rows %then %do;
240 proc sql;
241 insert into &outds values("&lev1 &lev2 &lev3 &lev4 &lev5");
242 %if %mf_nobs(&outds) ge &max_guesses %then %do;
243 %put &sysmacroname: Max PKs reached at Level 5 for &baseds;
244 %return;
245 %end;
246 %end;
247 %end;
248 %end;
249 %end;
250 %end;
251 %end;
252 %end;
253
254 %if &ppkcnt=5 %then %do;
255 %put &sysmacroname: No more PK guess possible;
256 %return;
257 %end;
258
259 /* scan for uniques on up to 4 PK fields */
260 %local lev6 n;
261 %do i=1 %to &ppkcnt;
262 %let lev1=%scan(&posspks,&i);
263 %do j=2 %to &ppkcnt;
264 %let lev2=%scan(&posspks,&j);
265 %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
266 %let lev3=%scan(&posspks,&k);
267 %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
268 %let lev4=%scan(&posspks,&l);
269 %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then
270 %do m=5 %to &ppkcnt;
271 %let lev5=%scan(&posspks,&m);
272 %if &lev1 ne &lev5 & &lev2 ne &lev5 & &lev3 ne &lev5 & &lev4 ne &lev5 %then
273 %do n=6 %to &ppkcnt;
274 %let lev6=%scan(&posspks,&n);
275 %if &lev1 ne &lev6 & &lev2 ne &lev6 & &lev3 ne &lev6
276 & &lev4 ne &lev6 & &lev5 ne &lev6 %then
277 %do;
278 /* check for four level uniqueness */
279 proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4 &lev5 &lev6)
280 out=&tmpds noduprec;
281 by _all_;
282 run;
283 %if %mf_nobs(&tmpds)=&rows %then %do;
284 proc sql;
285 insert into &outds values("&lev1 &lev2 &lev3 &lev4 &lev5 &lev6");
286 %if %mf_nobs(&outds) ge &max_guesses %then %do;
287 %put &sysmacroname: Max PKs reached at Level 6 for &baseds;
288 %return;
289 %end;
290 %end;
291 %end;
292 %end;
293 %end;
294 %end;
295 %end;
296 %end;
297 %end;
298
299 %if &ppkcnt=6 %then %do;
300 %put &sysmacroname: No more PK guess possible;
301 %return;
302 %end;
303
304%mend mp_guesspk;