Macros for SAS Application Developers
https://github.com/sasjs/core
Loading...
Searching...
No Matches
mp_guesspk.sas
Go to the documentation of this file.
1/**
2 @file
3 @brief Guess the primary key of a table
4 @details Tries to guess the primary key of a table based on the following
5 logic:
6
7 * Columns with nulls are ignored
8 * Return only column combinations that provide unique results
9 * Start from one column, then move out to composite keys of 2 to 6 columns
10
11 The library of the target should be assigned before using this macro.
12
13 Usage:
14
15 filename mc url
16 "https://raw.githubusercontent.com/sasjs/core/main/all.sas";
17 %inc mc;
18 %mp_guesspk(sashelp.class,outds=classpks)
19
20 @param [in] baseds The dataset to analyse
21 @param [out] outds= (mp_guesspk) Output dataset to contain the possible PKs
22 @param [in] max_guesses= (3) The total number of possible primary keys to
23 generate. A table may have multiple (unlikely) PKs, so no need to list them
24 all.
25 @param [in] min_rows= (5) The minimum number of rows a table should have in
26 order to try and guess the PK.
27 @param [in] ignore_cols (0) Space seperated list of columns which you are
28 sure are not part of the primary key (helps to avoid false positives)
29 @param [in] mdebug= Set to 1 to enable DEBUG messages and preserve outputs
30
31 <h4> SAS Macros </h4>
32 @li mf_getvarlist.sas
33 @li mf_getuniquename.sas
34 @li mf_wordsInstr1butnotstr2.sas
35 @li mf_nobs.sas
36
37 <h4> Related Macros </h4>
38 @li mp_getpk.sas
39
40 @version 9.3
41 @author Allan Bowe
42
43**/
44
45%macro mp_guesspk(baseds
46 ,outds=mp_guesspk
47 ,max_guesses=3
48 ,min_rows=5
49 ,ignore_cols=0
50 ,mdebug=0
51)/*/STORE SOURCE*/;
52%local dbg;
53%if &mdebug=1 %then %do;
54 %put &sysmacroname entry vars:;
55 %put _local_;
56%end;
57%else %let dbg=*;
58
59/* declare local vars */
60%local var vars vcnt i j k l tmpvar tmpds rows posspks ppkcnt;
61%let vars=%upcase(%mf_getvarlist(&baseds));
62%let vars=%mf_wordsInStr1ButNotStr2(str1=&vars,str2=%upcase(&ignore_cols));
63%let vcnt=%sysfunc(countw(&vars));
64
65%if &vcnt=0 %then %do;
66 %put &sysmacroname: &baseds has no variables! Exiting.;
67 %return;
68%end;
69
70/* get null count and row count */
71%let tmpvar=%mf_getuniquename();
72proc sql noprint;
73create table _data_ as select
74 count(*) as &tmpvar
75%do i=1 %to &vcnt;
76 %let var=%scan(&vars,&i);
77 ,sum(case when &var is missing then 1 else 0 end) as &var
78%end;
79 from &baseds;
80
81/* transpose table and scan for not null cols */
82proc transpose;
83data _null_;
84 set &syslast end=last;
85 length vars $32767;
86 retain vars ;
87 if _name_="&tmpvar" then call symputx('rows',col1,'l');
88 else if col1=0 then vars=catx(' ',vars,_name_);
89 if last then call symputx('posspks',vars,'l');
90run;
91
92%let ppkcnt=%sysfunc(countw(&posspks));
93%if &ppkcnt=0 %then %do;
94 %put &sysmacroname: &baseds has no non-missing variables! Exiting.;
95 %return;
96%end;
97
98proc sort data=&baseds(keep=&posspks) out=_data_ noduprec;
99 by _all_;
100run;
101%local pkds; %let pkds=&syslast;
102
103%if &rows > %mf_nobs(&pkds) %then %do;
104 %put &sysmacroname: &baseds has no combination of unique records! Exiting.;
105 %return;
106%end;
107
108/* now check cardinality */
109proc sql noprint;
110create table _data_ as select
111%do i=1 %to &ppkcnt;
112 %let var=%scan(&posspks,&i);
113 count(distinct &var) as &var
114 %if &i<&ppkcnt %then ,;
115%end;
116 from &pkds;
117
118/* transpose and sort by cardinality */
119proc transpose;
120proc sort; by descending col1;
121run;
122
123/* create initial PK list and re-order posspks list */
124data &outds(keep=pkguesses);
125 length pkguesses $5000 vars $5000;
126 set &syslast end=last;
127 retain vars ;
128 vars=catx(' ',vars,_name_);
129 if col1=&rows then do;
130 pkguesses=_name_;
131 output;
132 end;
133 if last then call symputx('posspks',vars,'l');
134run;
135
136%if %mf_nobs(&outds) ge &max_guesses %then %do;
137 %put &sysmacroname: %mf_nobs(&outds) possible primary key values found;
138 %return;
139%end;
140
141%if &ppkcnt=1 %then %do;
142 %put &sysmacroname: No more PK guess possible;
143 %return;
144%end;
145
146/* begin scanning for uniques on pairs of PKs */
147%let tmpds=%mf_getuniquename();
148%local lev1 lev2;
149%do i=1 %to &ppkcnt;
150 %let lev1=%scan(&posspks,&i);
151 %do j=2 %to &ppkcnt;
152 %let lev2=%scan(&posspks,&j);
153 %if &lev1 ne &lev2 %then %do;
154 /* check for two level uniqueness */
155 proc sort data=&pkds(keep=&lev1 &lev2) out=&tmpds noduprec;
156 by _all_;
157 run;
158 %if %mf_nobs(&tmpds)=&rows %then %do;
159 proc sql;
160 insert into &outds values("&lev1 &lev2");
161 %if %mf_nobs(&outds) ge &max_guesses %then %do;
162 %put &sysmacroname: Max PKs reached at Level 2 for &baseds;
163 %goto exit;
164 %end;
165 %end;
166 %end;
167 %end;
168%end;
169
170%if &ppkcnt=2 %then %do;
171 %put &sysmacroname: No more PK guess possible;
172 %goto exit;
173%end;
174
175/* begin scanning for uniques on PK triplets */
176%local lev3;
177%do i=1 %to &ppkcnt;
178 %let lev1=%scan(&posspks,&i);
179 %do j=2 %to &ppkcnt;
180 %let lev2=%scan(&posspks,&j);
181 %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
182 %let lev3=%scan(&posspks,&k);
183 %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do;
184 /* check for three level uniqueness */
185 proc sort data=&pkds(keep=&lev1 &lev2 &lev3) out=&tmpds noduprec;
186 by _all_;
187 run;
188 %if %mf_nobs(&tmpds)=&rows %then %do;
189 proc sql;
190 insert into &outds values("&lev1 &lev2 &lev3");
191 %if %mf_nobs(&outds) ge &max_guesses %then %do;
192 %put &sysmacroname: Max PKs reached at Level 3 for &baseds;
193 %goto exit;
194 %end;
195 %end;
196 %end;
197 %end;
198 %end;
199%end;
200
201%if &ppkcnt=3 %then %do;
202 %put &sysmacroname: No more PK guess possible;
203 %goto exit;
204%end;
205
206/* scan for uniques on up to 4 PK fields */
207%local lev4;
208%do i=1 %to &ppkcnt;
209 %let lev1=%scan(&posspks,&i);
210 %do j=2 %to &ppkcnt;
211 %let lev2=%scan(&posspks,&j);
212 %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
213 %let lev3=%scan(&posspks,&k);
214 %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
215 %let lev4=%scan(&posspks,&l);
216 %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then %do;
217 /* check for four level uniqueness */
218 proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4)
219 out=&tmpds noduprec;
220 by _all_;
221 run;
222 %if %mf_nobs(&tmpds)=&rows %then %do;
223 proc sql;
224 insert into &outds values("&lev1 &lev2 &lev3 &lev4");
225 %if %mf_nobs(&outds) ge &max_guesses %then %do;
226 %put &sysmacroname: Max PKs reached at Level 4 for &baseds;
227 %goto exit;
228 %end;
229 %end;
230 %end;
231 %end;
232 %end;
233 %end;
234%end;
235
236%if &ppkcnt=4 %then %do;
237 %put &sysmacroname: No more PK guess possible;
238 %goto exit;
239%end;
240
241/* scan for uniques on up to 4 PK fields */
242%local lev5 m;
243%do i=1 %to &ppkcnt;
244 %let lev1=%scan(&posspks,&i);
245 %do j=2 %to &ppkcnt;
246 %let lev2=%scan(&posspks,&j);
247 %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
248 %let lev3=%scan(&posspks,&k);
249 %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
250 %let lev4=%scan(&posspks,&l);
251 %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then
252 %do m=5 %to &ppkcnt;
253 %let lev5=%scan(&posspks,&m);
254 %if &lev1 ne &lev5 & &lev2 ne &lev5 & &lev3 ne &lev5 & &lev4 ne &lev5 %then %do;
255 /* check for four level uniqueness */
256 proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4 &lev5)
257 out=&tmpds noduprec;
258 by _all_;
259 run;
260 %if %mf_nobs(&tmpds)=&rows %then %do;
261 proc sql;
262 insert into &outds values("&lev1 &lev2 &lev3 &lev4 &lev5");
263 %if %mf_nobs(&outds) ge &max_guesses %then %do;
264 %put &sysmacroname: Max PKs reached at Level 5 for &baseds;
265 %goto exit;
266 %end;
267 %end;
268 %end;
269 %end;
270 %end;
271 %end;
272 %end;
273%end;
274
275%if &ppkcnt=5 %then %do;
276 %put &sysmacroname: No more PK guess possible;
277 %goto exit;
278%end;
279
280/* scan for uniques on up to 4 PK fields */
281%local lev6 n;
282%do i=1 %to &ppkcnt;
283 %let lev1=%scan(&posspks,&i);
284 %do j=2 %to &ppkcnt;
285 %let lev2=%scan(&posspks,&j);
286 %if &lev1 ne &lev2 %then %do k=3 %to &ppkcnt;
287 %let lev3=%scan(&posspks,&k);
288 %if &lev1 ne &lev3 and &lev2 ne &lev3 %then %do l=4 %to &ppkcnt;
289 %let lev4=%scan(&posspks,&l);
290 %if &lev1 ne &lev4 and &lev2 ne &lev4 and &lev3 ne &lev4 %then
291 %do m=5 %to &ppkcnt;
292 %let lev5=%scan(&posspks,&m);
293 %if &lev1 ne &lev5 & &lev2 ne &lev5 & &lev3 ne &lev5 & &lev4 ne &lev5
294 %then %do n=6 %to &ppkcnt;
295 %let lev6=%scan(&posspks,&n);
296 %if &lev1 ne &lev6 & &lev2 ne &lev6 & &lev3 ne &lev6
297 & &lev4 ne &lev6 & &lev5 ne &lev6 %then
298 %do;
299 /* check for four level uniqueness */
300 proc sort data=&pkds(keep=&lev1 &lev2 &lev3 &lev4 &lev5 &lev6)
301 out=&tmpds noduprec;
302 by _all_;
303 run;
304 %if %mf_nobs(&tmpds)=&rows %then %do;
305 proc sql;
306 insert into &outds
307 values("&lev1 &lev2 &lev3 &lev4 &lev5 &lev6");
308 %if %mf_nobs(&outds) ge &max_guesses %then %do;
309 %put &sysmacroname: Max PKs reached at Level 6 for &baseds;
310 %goto exit;
311 %end;
312 %end;
313 %end;
314 %end;
315 %end;
316 %end;
317 %end;
318 %end;
319%end;
320
321%if &ppkcnt=6 %then %do;
322 %put &sysmacroname: No more PK guess possible;
323 %goto exit;
324%end;
325
326%exit:
327%if &mdebug=0 %then %do;
328 proc sql;
329 drop table &tmpds;
330%end;
331
332%mend mp_guesspk;