Macros for SAS Application Developers
https://github.com/sasjs/core
mp_stackdiffs.sas
Go to the documentation of this file.
1/**
2 @file
3 @brief Prepares an audit table for stacking (re-applying) the changes.
4 @details When the underlying data from a Base Table is refreshed, it can be
5 helpful to have any previously-applied changes, re-applied.
6
7 Such situation might arise if you are applying those changes using a tool
8 like [Data Controller for SASĀ®](https://datacontroller.io) - which records
9 all such changes in an audit table.
10 It may also apply if you are preparing a series of specific cell-level
11 transactions, that you would like to apply to multiple sets of (similarly
12 structured) Base Tables.
13
14 In both cases, it is necessary that the transactions are stored using
15 the mp_storediffs.sas macro, or at least that the underlying table is
16 structured as per the definition in mp_coretable.sas (DIFFTABLE entry)
17
18 <b>This</b> macro is used to convert the stored changes (tall format) into
19 staged changes (wide format), with base table values incorporated (in the
20 case of modified rows), ready for the subsequent load process.
21
22 Essentially then, what this macro does, is turn a table like this:
23
24|KEY_HASH:$32.|MOVE_TYPE:$1.|TGTVAR_NM:$32.|IS_PK:best.|IS_DIFF:best.|TGTVAR_TYPE:$1.|OLDVAL_NUM:best32.|NEWVAL_NUM:best32.|OLDVAL_CHAR:$32765.|NEWVAL_CHAR:$32765.|
25|---|---|---|---|---|---|---|---|---|---|
26|`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`NAME `|`1 `|`-1 `|`C `|`. `|`. `|` `|`Newbie `|
27|`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`AGE `|`0 `|`-1 `|`N `|`. `|`13 `|` `|` `|
28|`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`HEIGHT `|`0 `|`-1 `|`N `|`. `|`65.3 `|` `|` `|
29|`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`SEX `|`0 `|`-1 `|`C `|`. `|`. `|` `|`F `|
30|`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`WEIGHT `|`0 `|`-1 `|`N `|`. `|`98 `|` `|` `|
31|`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`NAME `|`1 `|`-1 `|`C `|`. `|`. `|`Alfred `|` `|
32|`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`AGE `|`0 `|`-1 `|`N `|`14 `|`. `|` `|` `|
33|`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`HEIGHT `|`0 `|`-1 `|`N `|`69 `|`. `|` `|` `|
34|`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`SEX `|`0 `|`-1 `|`C `|`. `|`. `|`M `|` `|
35|`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`WEIGHT `|`0 `|`-1 `|`N `|`112.5 `|`. `|` `|` `|
36|`64489C85DC2FE0787B85CD87214B3810 `|`M `|`NAME `|`1 `|`0 `|`C `|`. `|`. `|`Alice `|`Alice `|
37|`64489C85DC2FE0787B85CD87214B3810 `|`M `|`AGE `|`0 `|`1 `|`N `|`13 `|`99 `|` `|` `|
38|`64489C85DC2FE0787B85CD87214B3810 `|`M `|`HEIGHT `|`0 `|`0 `|`N `|`56.5 `|`56.5 `|` `|` `|
39|`64489C85DC2FE0787B85CD87214B3810 `|`M `|`SEX `|`0 `|`0 `|`C `|`. `|`. `|`F `|`F `|
40|`64489C85DC2FE0787B85CD87214B3810 `|`M `|`WEIGHT `|`0 `|`0 `|`N `|`84 `|`84 `|` `|` `|
41
42 Into three tables like this:
43
44 <b> `work.outmod`: </b>
45 |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
46 |---|---|---|---|---|
47 |`Alice `|`F `|`99 `|`56.5 `|`84 `|
48
49 <b> `work.outadd`: </b>
50 |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
51 |---|---|---|---|---|
52 |`Newbie `|`F `|`13 `|`65.3 `|`98 `|
53
54 <b> `work.outdel`: </b>
55 |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
56 |---|---|---|---|---|
57 |`Alfred `|`M `|`14 `|`69 `|`112.5 `|
58
59 As you might expect, there are a bunch of extra features and checks.
60
61 The macro supports both SCD2 (TXTEMPORAL) and UPDATE loadtypes. If the
62 base table contains a PROCESSED_DTTM column (or similar), this can be
63 ignored by declaring it in the `processed_dttm_var` parameter.
64
65 The macro is also flexible where columns have been added or removed from
66 the base table UNLESS there is a change to the primary key.
67
68 Changes to the primary key fields are NOT supported, and are likely to cause
69 unexpected results.
70
71 The following pre-flight checks are made:
72
73 @li All primary key columns exist on the base table
74 @li There is no change in variable TYPE for any of the columns
75 @li There is no reduction in variable LENGTH below the max-length of the
76 supplied values
77
78 Rules for stacking changes are as follows:
79
80 <table>
81 <tr>
82 <th>Transaction Type</th><th>Key Behaviour</th><th>Column Behaviour</th>
83 </tr>
84 <tr>
85 <td>Deletes</td>
86 <td>
87 The row is added to `&outDEL.` UNLESS it no longer exists
88 in the base table, in which case it is added to `&errDS.` instead.
89 </td>
90 <td>
91 Deletes are unaffected by the addition or removal of non Primary-Key
92 columns.
93 </td>
94 </tr>
95 <tr>
96 <td>Inserts</td>
97 <td>
98 Previously newly added rows are added to the `outADD` table UNLESS they
99 are present in the Base table.<br>In this case they are added to the
100 `&errDS.` table instead.
101 </td>
102 <td>
103 Inserts are unaffected by the addition of columns in the Base Table
104 (they are padded with blanks). Deleted columns are only a problem if
105 they appear on the previous insert - in which case the record is added
106 to `&errDS.`.
107 </td>
108 </tr>
109 <tr>
110 <td>Updates</td>
111 <td>
112 Previously modified rows are merged with base table values such that
113 only the individual cells that were _previously_ changed are re-applied.
114 Where the row contains cells that were not marked as having changed in
115 the prior transaction, the 'blanks' are filled with base table values in
116 the `outMOD` table.<br>
117 If the row no longer exists on the base table, then the row is added to
118 the `errDS` table instead.
119 </td>
120 <td>
121 Updates are unaffected by the addition of columns in the Base Table -
122 the new cells are simply populated with Base Table values. Deleted
123 columns are only a problem if they relate to a modified cell
124 (`is_diff=1`) - in which case the record is added to `&errDS.`.
125 </td>
126 </tr>
127 </table>
128
129 To illustrate the above with a diagram:
130
131 @dot
132 digraph {
133 rankdir="TB"
134 start[label="Transaction Type?" shape=Mdiamond]
135 del[label="Does Base Row exist?" shape=rectangle]
136 add [label="Does Base Row exist?" shape=rectangle]
137 mod [label="Does Base Row exist?" shape=rectangle]
138 chkmod [label="Do all modified\n(is_diff=1) cells exist?" shape=rectangle]
139 chkadd [label="Do all inserted cells exist?" shape=rectangle]
140 outmod [label="outMOD\nTable" shape=Msquare style=filled]
141 outadd [label="outADD\nTable" shape=Msquare style=filled]
142 outdel [label="outDEL\nTable" shape=Msquare style=filled]
143 outerr [label="ErrDS Table" shape=Msquare fillcolor=Orange style=filled]
144 start -> del [label="Delete"]
145 start -> add [label="Insert"]
146 start -> mod [label="Update"]
147
148 del -> outdel [label="Yes"]
149 del -> outerr [label="No" color="Red" fontcolor="Red"]
150 add -> chkadd [label="No"]
151 add -> outerr [label="Yes" color="Red" fontcolor="Red"]
152 mod -> outerr [label="No" color="Red" fontcolor="Red"]
153 mod -> chkmod [label="Yes"]
154 chkmod -> outerr [label="No" color="Red" fontcolor="Red"]
155 chkmod -> outmod [label="Yes"]
156 chkadd -> outerr [label="No" color="Red" fontcolor="Red"]
157 chkadd -> outadd [label="Yes"]
158
159 }
160 @enddot
161
162 For examples of usage, check out the mp_stackdiffs.test.sas program.
163
164
165 @param [in] baselibds Base Table against which the changes will be applied,
166 in libref.dataset format.
167 @param [in] auditlibds Dataset with previously applied transactions, to be
168 re-applied. Use libref.dataset format.
169 DDL as follows: %mp_coretable(DIFFTABLE)
170 @param [in] key Space seperated list of key variables
171 @param [in] mdebug= Set to 1 to enable DEBUG messages and preserve outputs
172 @param [in] processed_dttm_var= (0) If a variable is being used to mark
173 the processed datetime, put the name of the variable here. It will NOT
174 be included in the staged dataset (the load process is expected to
175 provide this)
176 @param [out] errds= (work.errds) Output table containing problematic records.
177 The columns of this table are:
178 @li PK_VARS - Space separated list of primary key variable names
179 @li PK_VALS - Slash separted list of PK variable values
180 @li ERR_MSG - Explanation of why this record is problematic
181 @param [out] outmod= (work.outmod) Output table containing modified records
182 @param [out] outadd= (work.outadd) Output table containing additional records
183 @param [out] outdel= (work.outdel) Output table containing deleted records
184
185 <h4> SAS Macros </h4>
186 @li mf_existvarlist.sas
187 @li mf_getquotedstr.sas
188 @li mf_getuniquefileref.sas
189 @li mf_getuniquename.sas
190 @li mf_islibds.sas
191 @li mf_nobs.sas
192 @li mf_wordsinstr1butnotstr2.sas
193 @li mp_abort.sas
194 @li mp_ds2squeeze.sas
195
196 <h4> Related Macros </h4>
197 @li mp_coretable.sas
198 @li mp_stackdiffs.test.sas
199 @li mp_storediffs.sas
200
201 @todo The current approach assumes that a variable called KEY_HASH is not on
202 the base table. This part will need to be refactored (eg using
203 mf_getuniquename.sas) when such a use case arises.
204
205 @version 9.2
206 @author Allan Bowe
207**/
208/** @cond */
209
210%macro mp_stackdiffs(baselibds
211 ,auditlibds
212 ,key
213 ,mdebug=0
214 ,processed_dttm_var=0
215 ,errds=work.errds
216 ,outmod=work.outmod
217 ,outadd=work.outadd
218 ,outdel=work.outdel
219)/*/STORE SOURCE*/;
220%local dbg;
221%if &mdebug=1 %then %do;
222 %put &sysmacroname entry vars:;
223 %put _local_;
224%end;
225%else %let dbg=*;
226
227/* input parameter validations */
228%mp_abort(iftrue= (%mf_islibds(&baselibds) ne 1)
229 ,mac=&sysmacroname
230 ,msg=%str(Invalid baselibds: &baselibds)
231)
232%mp_abort(iftrue= (%mf_islibds(&auditlibds) ne 1)
233 ,mac=&sysmacroname
234 ,msg=%str(Invalid auditlibds: &auditlibds)
235)
236%mp_abort(iftrue= (%length(&key)=0)
237 ,mac=&sysmacroname
238 ,msg=%str(Missing key variables!)
239)
240%mp_abort(iftrue= (
241 %mf_existVarList(&auditlibds,LIBREF DSN MOVE_TYPE KEY_HASH TGTVAR_NM IS_PK
242 IS_DIFF TGTVAR_TYPE OLDVAL_NUM NEWVAL_NUM OLDVAL_CHAR NEWVAL_CHAR)=0
243 )
244 ,mac=&sysmacroname
245 ,msg=%str(Input &auditlibds is missing required columns!)
246)
247
248
249/* set up macro vars */
250%local prefix dslist x var keyjoin commakey keepvars missvars fref;
251%let prefix=%substr(%mf_getuniquename(),1,25);
252%let dslist=ds1d ds2d ds3d ds1a ds2a ds3a ds1m ds2m ds3m pks dups base
253 delrec delerr addrec adderr modrec moderr;
254%do x=1 %to %sysfunc(countw(&dslist));
255 %let var=%scan(&dslist,&x);
256 %local &var;
257 %let &var=%upcase(&prefix._&var);
258%end;
259
260%let key=%upcase(&key);
261%let commakey=%mf_getquotedstr(&key,quote=N);
262
263%let keyjoin=1=1;
264%do x=1 %to %sysfunc(countw(&key));
265 %let var=%scan(&key,&x);
266 %let keyjoin=&keyjoin and a.&var=b.&var;
267%end;
268
269data &errds;
270 length pk_vars $256 pk_vals $4098 err_msg $512;
271 call missing (of _all_);
272 stop;
273run;
274
275/**
276 * Prepare raw DELETE table
277 * Records are in the OLDVAL_xxx columns
278 */
279%let keepvars=MOVE_TYPE KEY_HASH TGTVAR_NM TGTVAR_TYPE IS_PK
280 OLDVAL_NUM OLDVAL_CHAR
281 NEWVAL_NUM NEWVAL_CHAR;
282proc sort data=&auditlibds(where=(move_type='D') keep=&keepvars)
283 out=&ds1d(drop=move_type);
284by KEY_HASH TGTVAR_NM;
285run;
286proc transpose data=&ds1d(where=(tgtvar_type='N'))
287 out=&ds2d(drop=_name_);
288 by KEY_HASH;
289 id TGTVAR_NM;
290 var OLDVAL_NUM;
291run;
292proc transpose data=&ds1d(where=(tgtvar_type='C'))
293 out=&ds3d(drop=_name_);
294 by KEY_HASH;
295 id TGTVAR_NM;
296 var OLDVAL_CHAR;
297run;
298%mp_ds2squeeze(&ds2d,outds=&ds2d)
299%mp_ds2squeeze(&ds3d,outds=&ds3d)
300data &outdel;
301 if 0 then set &baselibds;
302 set &ds2d;
303 set &ds3d;
304 drop key_hash;
305 if not missing(%scan(&key,1));
306run;
307proc sort;
308 by &key;
309run;
310
311/**
312 * Prepare raw APPEND table
313 * Records are in the NEWVAL_xxx columns
314 */
315proc sort data=&auditlibds(where=(move_type='A') keep=&keepvars)
316 out=&ds1a(drop=move_type);
317 by KEY_HASH TGTVAR_NM;
318run;
319proc transpose data=&ds1a(where=(tgtvar_type='N'))
320 out=&ds2a(drop=_name_);
321 by KEY_HASH;
322 id TGTVAR_NM;
323 var NEWVAL_NUM;
324run;
325proc transpose data=&ds1a(where=(tgtvar_type='C'))
326 out=&ds3a(drop=_name_);
327 by KEY_HASH;
328 id TGTVAR_NM;
329 var NEWVAL_CHAR;
330run;
331%mp_ds2squeeze(&ds2a,outds=&ds2a)
332%mp_ds2squeeze(&ds3a,outds=&ds3a)
333data &outadd;
334 if 0 then set &baselibds;
335 set &ds2a;
336 set &ds3a;
337 drop key_hash;
338 if not missing(%scan(&key,1));
339run;
340proc sort;
341 by &key;
342run;
343
344/**
345 * Prepare raw MODIFY table
346 * Keep only primary key - will add modified values later
347 */
348proc sort data=&auditlibds(
349 where=(move_type='M' and is_pk=1) keep=&keepvars
350 ) out=&ds1m(drop=move_type);
351 by KEY_HASH TGTVAR_NM;
352run;
353proc transpose data=&ds1m(where=(tgtvar_type='N'))
354 out=&ds2m(drop=_name_);
355 by KEY_HASH ;
356 id TGTVAR_NM;
357 var NEWVAL_NUM;
358run;
359proc transpose data=&ds1m(where=(tgtvar_type='C'))
360 out=&ds3m(drop=_name_);
361 by KEY_HASH;
362 id TGTVAR_NM;
363 var NEWVAL_CHAR;
364run;
365%mp_ds2squeeze(&ds2m,outds=&ds2m)
366%mp_ds2squeeze(&ds3m,outds=&ds3m)
367data &outmod;
368 if 0 then set &baselibds;
369 set &ds2m;
370 set &ds3m;
371 if not missing(%scan(&key,1));
372run;
373proc sort;
374 by &key;
375run;
376
377/**
378 * Extract matching records from the base table
379 * Do this in one join for efficiency.
380 * At a later date, this should be optimised for large database tables by using
381 * passthrough and a temporary table.
382 */
383data &pks;
384 if 0 then set &baselibds;
385 set &outadd &outmod &outdel;
386 keep &key;
387run;
388
389proc sort noduprec dupout=&dups;
390by &key;
391run;
392data _null_;
393 set &dups;
394 putlog (_all_)(=);
395run;
396%mp_abort(iftrue= (%mf_nobs(&dups) ne 0)
397 ,mac=&sysmacroname
398 ,msg=%str(duplicates (%mf_nobs(&dups)) found on &auditlibds!)
399)
400
401proc sql;
402create table &base as
403 select a.*
404 from &baselibds a, &pks b
405 where &keyjoin;
406
407/**
408 * delete check
409 * This is straightforward as it relates to records only
410 */
411proc sql;
412create table &delrec as
413 select a.*
414 from &outdel a
415 left join &base b
416 on &keyjoin
417 where b.%scan(&key,1) is null
418 order by &commakey;
419
420data &delerr;
421 if 0 then set &errds;
422 set &delrec;
423 PK_VARS="&key";
424 PK_VALS=catx('/',&commakey);
425 ERR_MSG="Rows cannot be deleted as they do not exist on the Base dataset";
426 keep PK_VARS PK_VALS ERR_MSG;
427run;
428proc append base=&errds data=&delerr;
429run;
430
431data &outdel;
432 merge &outdel (in=a) &delrec (in=b);
433 by &key;
434 if not b;
435run;
436
437/**
438 * add check
439 * Problems - where record already exists, or base table has columns missing
440 */
441%let missvars=%mf_wordsinstr1butnotstr2(
442 Str1=%upcase(%mf_getvarlist(&outadd)),
443 Str2=%upcase(%mf_getvarlist(&baselibds))
444);
445%if %length(&missvars)>0 %then %do;
446 /* add them to the err table */
447 data &adderr;
448 if 0 then set &errds;
449 set &outadd;
450 PK_VARS="&key";
451 PK_VALS=catx('/',&commakey);
452 ERR_MSG="Rows cannot be added due to missing base vars: &missvars";
453 keep PK_VARS PK_VALS ERR_MSG;
454 run;
455 proc append base=&errds data=&adderr;
456 run;
457 proc sql;
458 delete * from &outadd;
459%end;
460%else %do;
461 proc sql;
462 /* find records that already exist on base table */
463 create table &addrec as
464 select a.*
465 from &outadd a
466 inner join &base b
467 on &keyjoin
468 order by &commakey;
469
470 /* add them to the err table */
471 data &adderr;
472 if 0 then set &errds;
473 set &addrec;
474 PK_VARS="&key";
475 PK_VALS=catx('/',&commakey);
476 ERR_MSG="Rows cannot be added as they already exist on the Base dataset";
477 keep PK_VARS PK_VALS ERR_MSG;
478 run;
479 proc append base=&errds data=&adderr;
480 run;
481
482 /* remove invalid rows from the outadd table */
483 data &outadd;
484 merge &outadd (in=a) &addrec (in=b);
485 by &key;
486 if not b;
487 run;
488%end;
489
490/**
491 * mod check
492 * Problems - where record does not exist or baseds has modified cols missing
493 */
494proc sql noprint;
495select distinct tgtvar_nm into: missvars separated by ' '
496 from &auditlibds
497 where move_type='M' and is_diff=1;
498%let missvars=%mf_wordsinstr1butnotstr2(
499 Str1=&missvars,
500 Str2=%upcase(%mf_getvarlist(&baselibds))
501);
502%if %length(&missvars)>0 %then %do;
503 /* add them to the err table */
504 data &moderr;
505 if 0 then set &errds;
506 set &outmod;
507 PK_VARS="&key";
508 PK_VALS=catx('/',&commakey);
509 ERR_MSG="Rows cannot be modified due to missing base vars: &missvars";
510 keep PK_VARS PK_VALS ERR_MSG;
511 run;
512 proc append base=&errds data=&moderr;
513 run;
514 proc sql;
515 delete * from &outmod;
516%end;
517%else %do;
518 /* now check for records that do not exist (therefore cannot be modified) */
519 proc sql;
520 create table &modrec as
521 select a.*
522 from &outmod a
523 left join &base b
524 on &keyjoin
525 where b.%scan(&key,1) is null
526 order by &commakey;
527 data &moderr;
528 if 0 then set &errds;
529 set &modrec;
530 PK_VARS="&key";
531 PK_VALS=catx('/',&commakey);
532 ERR_MSG="Rows cannot be modified as they do not exist on the Base dataset";
533 keep PK_VARS PK_VALS ERR_MSG;
534 run;
535 proc append base=&errds data=&moderr;
536 run;
537 /* delete the above records from the outmod table */
538 data &outmod;
539 merge &outmod (in=a) &modrec (in=b);
540 by &key;
541 if not b;
542 run;
543 /* now - we can prepare the final MOD table (which is currently PK only) */
544 proc sql undo_policy=none;
545 create table &outmod as
546 select a.key_hash
547 ,b.*
548 from &outmod a
549 inner join &base b
550 on &keyjoin
551 order by &commakey;
552 /* now - to update outmod with modified (is_diff=1) values */
553 %let fref=%mf_getuniquefileref();
554 data _null_;
555 file &fref;
556 set &auditlibds(where=(move_type='M')) end=lastobs;
557 by key_hash;
558 retain comma 'N';
559 if _n_=1 then put 'proc sql;';
560 if first.key_hash then do;
561 comma='N';
562 put "update &outmod set " @@;
563 end;
564 if is_diff=1 then do;
565 if comma='N' then do;
566 put ' '@@;
567 comma='Y';
568 end;
569 else put ' ,'@@;
570 if tgtvar_type='C' then do;
571 length qstr $32767;
572 qstr=quote(trim(NEWVAL_CHAR));
573 put tgtvar_nm '=' qstr;
574 end;
575 else put tgtvar_nm '=' newval_num;
576 if comma=' ' then comma=' ,';
577 end;
578 if last.key_hash then put ' where key_hash=trim("' key_hash '");';
579 if lastobs then put "alter table &outmod drop key_hash;";
580 run;
581 %inc &fref/source2;
582%end;
583
584%if &mdebug=0 %then %do;
585 proc datasets lib=work;
586 delete &prefix:;
587 run;
588 %put &sysmacroname exit vars:;
589 %put _local_;
590%end;
591%mend mp_stackdiffs;
592/** @endcond */