Macros for SAS Application Developers
https://github.com/sasjs/core
mp_stackdiffs.sas
Go to the documentation of this file.
1 /**
2  @file
3  @brief Prepares an audit table for stacking (re-applying) the changes.
4  @details When the underlying data from a Base Table is refreshed, it can be
5  helpful to have any previously-applied changes, re-applied.
6 
7  Such situation might arise if you are applying those changes using a tool
8  like [Data Controller for SASĀ®](https://datacontroller.io) - which records
9  all such changes in an audit table.
10  It may also apply if you are preparing a series of specific cell-level
11  transactions, that you would like to apply to multiple sets of (similarly
12  structured) Base Tables.
13 
14  In both cases, it is necessary that the transactions are stored using
15  the mp_storediffs.sas macro, or at least that the underlying table is
16  structured as per the definition in mp_coretable.sas (DIFFTABLE entry)
17 
18  <b>This</b> macro is used to convert the stored changes (tall format) into
19  staged changes (wide format), with base table values incorporated (in the
20  case of modified rows), ready for the subsequent load process.
21 
22  Essentially then, what this macro does, is turn a table like this:
23 
24 |KEY_HASH:$32.|MOVE_TYPE:$1.|TGTVAR_NM:$32.|IS_PK:best.|IS_DIFF:best.|TGTVAR_TYPE:$1.|OLDVAL_NUM:best32.|NEWVAL_NUM:best32.|OLDVAL_CHAR:$32765.|NEWVAL_CHAR:$32765.|
25 |---|---|---|---|---|---|---|---|---|---|
26 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`NAME `|`1 `|`-1 `|`C `|`. `|`. `|` `|`Newbie `|
27 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`AGE `|`0 `|`-1 `|`N `|`. `|`13 `|` `|` `|
28 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`HEIGHT `|`0 `|`-1 `|`N `|`. `|`65.3 `|` `|` `|
29 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`SEX `|`0 `|`-1 `|`C `|`. `|`. `|` `|`F `|
30 |`27AA6F7581052E7FF48E1BCA901313FB `|`A `|`WEIGHT `|`0 `|`-1 `|`N `|`. `|`98 `|` `|` `|
31 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`NAME `|`1 `|`-1 `|`C `|`. `|`. `|`Alfred `|` `|
32 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`AGE `|`0 `|`-1 `|`N `|`14 `|`. `|` `|` `|
33 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`HEIGHT `|`0 `|`-1 `|`N `|`69 `|`. `|` `|` `|
34 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`SEX `|`0 `|`-1 `|`C `|`. `|`. `|`M `|` `|
35 |`86703FDE9E87DD5C0F8E1072545D0128 `|`D `|`WEIGHT `|`0 `|`-1 `|`N `|`112.5 `|`. `|` `|` `|
36 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`NAME `|`1 `|`0 `|`C `|`. `|`. `|`Alice `|`Alice `|
37 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`AGE `|`0 `|`1 `|`N `|`13 `|`99 `|` `|` `|
38 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`HEIGHT `|`0 `|`0 `|`N `|`56.5 `|`56.5 `|` `|` `|
39 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`SEX `|`0 `|`0 `|`C `|`. `|`. `|`F `|`F `|
40 |`64489C85DC2FE0787B85CD87214B3810 `|`M `|`WEIGHT `|`0 `|`0 `|`N `|`84 `|`84 `|` `|` `|
41 
42  Into three tables like this:
43 
44  <b> `work.outmod`: </b>
45  |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
46  |---|---|---|---|---|
47  |`Alice `|`F `|`99 `|`56.5 `|`84 `|
48 
49  <b> `work.outadd`: </b>
50  |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
51  |---|---|---|---|---|
52  |`Newbie `|`F `|`13 `|`65.3 `|`98 `|
53 
54  <b> `work.outdel`: </b>
55  |NAME:$8.|SEX:$1.|AGE:best.|HEIGHT:best.|WEIGHT:best.|
56  |---|---|---|---|---|
57  |`Alfred `|`M `|`14 `|`69 `|`112.5 `|
58 
59  As you might expect, there are a bunch of extra features and checks.
60 
61  The macro supports both SCD2 (TXTEMPORAL) and UPDATE loadtypes. If the
62  base table contains a PROCESSED_DTTM column (or similar), this can be
63  ignored by declaring it in the `processed_dttm_var` parameter.
64 
65  The macro is also flexible where columns have been added or removed from
66  the base table UNLESS there is a change to the primary key.
67 
68  Changes to the primary key fields are NOT supported, and are likely to cause
69  unexpected results.
70 
71  The following pre-flight checks are made:
72 
73  @li All primary key columns exist on the base table
74  @li There is no change in variable TYPE for any of the columns
75  @li There is no reduction in variable LENGTH below the max-length of the
76  supplied values
77 
78  Rules for stacking changes are as follows:
79 
80  <table>
81  <tr>
82  <th>Transaction Type</th><th>Key Behaviour</th><th>Column Behaviour</th>
83  </tr>
84  <tr>
85  <td>Deletes</td>
86  <td>
87  The row is added to `&outDEL.` UNLESS it no longer exists
88  in the base table, in which case it is added to `&errDS.` instead.
89  </td>
90  <td>
91  Deletes are unaffected by the addition or removal of non Primary-Key
92  columns.
93  </td>
94  </tr>
95  <tr>
96  <td>Inserts</td>
97  <td>
98  Previously newly added rows are added to the `outADD` table UNLESS they
99  are present in the Base table.<br>In this case they are added to the
100  `&errDS.` table instead.
101  </td>
102  <td>
103  Inserts are unaffected by the addition of columns in the Base Table
104  (they are padded with blanks). Deleted columns are only a problem if
105  they appear on the previous insert - in which case the record is added
106  to `&errDS.`.
107  </td>
108  </tr>
109  <tr>
110  <td>Updates</td>
111  <td>
112  Previously modified rows are merged with base table values such that
113  only the individual cells that were _previously_ changed are re-applied.
114  Where the row contains cells that were not marked as having changed in
115  the prior transaction, the 'blanks' are filled with base table values in
116  the `outMOD` table.<br>
117  If the row no longer exists on the base table, then the row is added to
118  the `errDS` table instead.
119  </td>
120  <td>
121  Updates are unaffected by the addition of columns in the Base Table -
122  the new cells are simply populated with Base Table values. Deleted
123  columns are only a problem if they relate to a modified cell
124  (`is_diff=1`) - in which case the record is added to `&errDS.`.
125  </td>
126  </tr>
127  </table>
128 
129  To illustrate the above with a diagram:
130 
131  @dot
132  digraph {
133  rankdir="TB"
134  start[label="Transaction Type?" shape=Mdiamond]
135  del[label="Does Base Row exist?" shape=rectangle]
136  add [label="Does Base Row exist?" shape=rectangle]
137  mod [label="Does Base Row exist?" shape=rectangle]
138  chkmod [label="Do all modified\n(is_diff=1) cells exist?" shape=rectangle]
139  chkadd [label="Do all inserted cells exist?" shape=rectangle]
140  outmod [label="outMOD\nTable" shape=Msquare style=filled]
141  outadd [label="outADD\nTable" shape=Msquare style=filled]
142  outdel [label="outDEL\nTable" shape=Msquare style=filled]
143  outerr [label="ErrDS Table" shape=Msquare fillcolor=Orange style=filled]
144  start -> del [label="Delete"]
145  start -> add [label="Insert"]
146  start -> mod [label="Update"]
147 
148  del -> outdel [label="Yes"]
149  del -> outerr [label="No" color="Red" fontcolor="Red"]
150  add -> chkadd [label="No"]
151  add -> outerr [label="Yes" color="Red" fontcolor="Red"]
152  mod -> outerr [label="No" color="Red" fontcolor="Red"]
153  mod -> chkmod [label="Yes"]
154  chkmod -> outerr [label="No" color="Red" fontcolor="Red"]
155  chkmod -> outmod [label="Yes"]
156  chkadd -> outerr [label="No" color="Red" fontcolor="Red"]
157  chkadd -> outadd [label="Yes"]
158 
159  }
160  @enddot
161 
162  For examples of usage, check out the mp_stackdiffs.test.sas program.
163 
164 
165  @param [in] baselibds Base Table against which the changes will be applied,
166  in libref.dataset format.
167  @param [in] auditlibds Dataset with previously applied transactions, to be
168  re-applied. Use libref.dataset format.
169  DDL as follows: %mp_coretable(DIFFTABLE)
170  @param [in] key Space seperated list of key variables
171  @param [in] mdebug= Set to 1 to enable DEBUG messages and preserve outputs
172  @param [in] processed_dttm_var= (0) If a variable is being used to mark
173  the processed datetime, put the name of the variable here. It will NOT
174  be included in the staged dataset (the load process is expected to
175  provide this)
176  @param [out] errds= (work.errds) Output table containing problematic records.
177  The columns of this table are:
178  @li PK_VARS - Space separated list of primary key variable names
179  @li PK_VALS - Slash separted list of PK variable values
180  @li ERR_MSG - Explanation of why this record is problematic
181  @param [out] outmod= (work.outmod) Output table containing modified records
182  @param [out] outadd= (work.outadd) Output table containing additional records
183  @param [out] outdel= (work.outdel) Output table containing deleted records
184 
185  <h4> SAS Macros </h4>
186  @li mf_existvarlist.sas
187  @li mf_getquotedstr.sas
188  @li mf_getuniquefileref.sas
189  @li mf_getuniquename.sas
190  @li mf_islibds.sas
191  @li mf_nobs.sas
192  @li mf_wordsinstr1butnotstr2.sas
193  @li mp_abort.sas
194  @li mp_ds2squeeze.sas
195 
196  <h4> Related Macros </h4>
197  @li mp_coretable.sas
198  @li mp_stackdiffs.test.sas
199  @li mp_storediffs.sas
200 
201  @todo The current approach assumes that a variable called KEY_HASH is not on
202  the base table. This part will need to be refactored (eg using
203  mf_getuniquename.sas) when such a use case arises.
204 
205  @version 9.2
206  @author Allan Bowe
207 **/
208 /** @cond */
209 
210 %macro mp_stackdiffs(baselibds
211  ,auditlibds
212  ,key
213  ,mdebug=0
214  ,processed_dttm_var=0
215  ,errds=work.errds
216  ,outmod=work.outmod
217  ,outadd=work.outadd
218  ,outdel=work.outdel
219 )/*/STORE SOURCE*/;
220 %local dbg;
221 %if &mdebug=1 %then %do;
222  %put &sysmacroname entry vars:;
223  %put _local_;
224 %end;
225 %else %let dbg=*;
226 
227 /* input parameter validations */
228 %mp_abort(iftrue= (%mf_islibds(&baselibds) ne 1)
229  ,mac=&sysmacroname
230  ,msg=%str(Invalid baselibds: &baselibds)
231 )
232 %mp_abort(iftrue= (%mf_islibds(&auditlibds) ne 1)
233  ,mac=&sysmacroname
234  ,msg=%str(Invalid auditlibds: &auditlibds)
235 )
236 %mp_abort(iftrue= (%length(&key)=0)
237  ,mac=&sysmacroname
238  ,msg=%str(Missing key variables!)
239 )
240 %mp_abort(iftrue= (
241  %mf_existVarList(&auditlibds,LIBREF DSN MOVE_TYPE KEY_HASH TGTVAR_NM IS_PK
242  IS_DIFF TGTVAR_TYPE OLDVAL_NUM NEWVAL_NUM OLDVAL_CHAR NEWVAL_CHAR)=0
243  )
244  ,mac=&sysmacroname
245  ,msg=%str(Input &auditlibds is missing required columns!)
246 )
247 
248 
249 /* set up macro vars */
250 %local prefix dslist x var keyjoin commakey keepvars missvars fref;
251 %let prefix=%substr(%mf_getuniquename(),1,25);
252 %let dslist=ds1d ds2d ds3d ds1a ds2a ds3a ds1m ds2m ds3m pks dups base
253  delrec delerr addrec adderr modrec moderr;
254 %do x=1 %to %sysfunc(countw(&dslist));
255  %let var=%scan(&dslist,&x);
256  %local &var;
257  %let &var=%upcase(&prefix._&var);
258 %end;
259 
260 %let key=%upcase(&key);
261 %let commakey=%mf_getquotedstr(&key,quote=N);
262 
263 %let keyjoin=1=1;
264 %do x=1 %to %sysfunc(countw(&key));
265  %let var=%scan(&key,&x);
266  %let keyjoin=&keyjoin and a.&var=b.&var;
267 %end;
268 
269 data &errds;
270  length pk_vars $256 pk_vals $4098 err_msg $512;
271  call missing (of _all_);
272  stop;
273 run;
274 
275 /**
276  * Prepare raw DELETE table
277  * Records are in the OLDVAL_xxx columns
278  */
279 %let keepvars=MOVE_TYPE KEY_HASH TGTVAR_NM TGTVAR_TYPE IS_PK
280  OLDVAL_NUM OLDVAL_CHAR
281  NEWVAL_NUM NEWVAL_CHAR;
282 proc sort data=&auditlibds(where=(move_type='D') keep=&keepvars)
283  out=&ds1d(drop=move_type);
284 by KEY_HASH TGTVAR_NM;
285 run;
286 proc transpose data=&ds1d(where=(tgtvar_type='N'))
287  out=&ds2d(drop=_name_);
288  by KEY_HASH;
289  id TGTVAR_NM;
290  var OLDVAL_NUM;
291 run;
292 proc transpose data=&ds1d(where=(tgtvar_type='C'))
293  out=&ds3d(drop=_name_);
294  by KEY_HASH;
295  id TGTVAR_NM;
296  var OLDVAL_CHAR;
297 run;
298 %mp_ds2squeeze(&ds2d,outds=&ds2d)
299 %mp_ds2squeeze(&ds3d,outds=&ds3d)
300 data &outdel;
301  if 0 then set &baselibds;
302  set &ds2d;
303  set &ds3d;
304  drop key_hash;
305  if not missing(%scan(&key,1));
306 run;
307 proc sort;
308  by &key;
309 run;
310 
311 /**
312  * Prepare raw APPEND table
313  * Records are in the NEWVAL_xxx columns
314  */
315 proc sort data=&auditlibds(where=(move_type='A') keep=&keepvars)
316  out=&ds1a(drop=move_type);
317  by KEY_HASH TGTVAR_NM;
318 run;
319 proc transpose data=&ds1a(where=(tgtvar_type='N'))
320  out=&ds2a(drop=_name_);
321  by KEY_HASH;
322  id TGTVAR_NM;
323  var NEWVAL_NUM;
324 run;
325 proc transpose data=&ds1a(where=(tgtvar_type='C'))
326  out=&ds3a(drop=_name_);
327  by KEY_HASH;
328  id TGTVAR_NM;
329  var NEWVAL_CHAR;
330 run;
331 %mp_ds2squeeze(&ds2a,outds=&ds2a)
332 %mp_ds2squeeze(&ds3a,outds=&ds3a)
333 data &outadd;
334  if 0 then set &baselibds;
335  set &ds2a;
336  set &ds3a;
337  drop key_hash;
338  if not missing(%scan(&key,1));
339 run;
340 proc sort;
341  by &key;
342 run;
343 
344 /**
345  * Prepare raw MODIFY table
346  * Keep only primary key - will add modified values later
347  */
348 proc sort data=&auditlibds(
349  where=(move_type='M' and is_pk=1) keep=&keepvars
350  ) out=&ds1m(drop=move_type);
351  by KEY_HASH TGTVAR_NM;
352 run;
353 proc transpose data=&ds1m(where=(tgtvar_type='N'))
354  out=&ds2m(drop=_name_);
355  by KEY_HASH ;
356  id TGTVAR_NM;
357  var NEWVAL_NUM;
358 run;
359 proc transpose data=&ds1m(where=(tgtvar_type='C'))
360  out=&ds3m(drop=_name_);
361  by KEY_HASH;
362  id TGTVAR_NM;
363  var NEWVAL_CHAR;
364 run;
365 %mp_ds2squeeze(&ds2m,outds=&ds2m)
366 %mp_ds2squeeze(&ds3m,outds=&ds3m)
367 data &outmod;
368  if 0 then set &baselibds;
369  set &ds2m;
370  set &ds3m;
371  if not missing(%scan(&key,1));
372 run;
373 proc sort;
374  by &key;
375 run;
376 
377 /**
378  * Extract matching records from the base table
379  * Do this in one join for efficiency.
380  * At a later date, this should be optimised for large database tables by using
381  * passthrough and a temporary table.
382  */
383 data &pks;
384  if 0 then set &baselibds;
385  set &outadd &outmod &outdel;
386  keep &key;
387 run;
388 
389 proc sort noduprec dupout=&dups;
390 by &key;
391 run;
392 data _null_;
393  set &dups;
394  putlog (_all_)(=);
395 run;
396 %mp_abort(iftrue= (%mf_nobs(&dups) ne 0)
397  ,mac=&sysmacroname
398  ,msg=%str(duplicates (%mf_nobs(&dups)) found on &auditlibds!)
399 )
400 
401 proc sql;
402 create table &base as
403  select a.*
404  from &baselibds a, &pks b
405  where &keyjoin;
406 
407 /**
408  * delete check
409  * This is straightforward as it relates to records only
410  */
411 proc sql;
412 create table &delrec as
413  select a.*
414  from &outdel a
415  left join &base b
416  on &keyjoin
417  where b.%scan(&key,1) is null
418  order by &commakey;
419 
420 data &delerr;
421  if 0 then set &errds;
422  set &delrec;
423  PK_VARS="&key";
424  PK_VALS=catx('/',&commakey);
425  ERR_MSG="Rows cannot be deleted as they do not exist on the Base dataset";
426  keep PK_VARS PK_VALS ERR_MSG;
427 run;
428 proc append base=&errds data=&delerr;
429 run;
430 
431 data &outdel;
432  merge &outdel (in=a) &delrec (in=b);
433  by &key;
434  if not b;
435 run;
436 
437 /**
438  * add check
439  * Problems - where record already exists, or base table has columns missing
440  */
441 %let missvars=%mf_wordsinstr1butnotstr2(
442  Str1=%upcase(%mf_getvarlist(&outadd)),
443  Str2=%upcase(%mf_getvarlist(&baselibds))
444 );
445 %if %length(&missvars)>0 %then %do;
446  /* add them to the err table */
447  data &adderr;
448  if 0 then set &errds;
449  set &outadd;
450  PK_VARS="&key";
451  PK_VALS=catx('/',&commakey);
452  ERR_MSG="Rows cannot be added due to missing base vars: &missvars";
453  keep PK_VARS PK_VALS ERR_MSG;
454  run;
455  proc append base=&errds data=&adderr;
456  run;
457  proc sql;
458  delete * from &outadd;
459 %end;
460 %else %do;
461  proc sql;
462  /* find records that already exist on base table */
463  create table &addrec as
464  select a.*
465  from &outadd a
466  inner join &base b
467  on &keyjoin
468  order by &commakey;
469 
470  /* add them to the err table */
471  data &adderr;
472  if 0 then set &errds;
473  set &addrec;
474  PK_VARS="&key";
475  PK_VALS=catx('/',&commakey);
476  ERR_MSG="Rows cannot be added as they already exist on the Base dataset";
477  keep PK_VARS PK_VALS ERR_MSG;
478  run;
479  proc append base=&errds data=&adderr;
480  run;
481 
482  /* remove invalid rows from the outadd table */
483  data &outadd;
484  merge &outadd (in=a) &addrec (in=b);
485  by &key;
486  if not b;
487  run;
488 %end;
489 
490 /**
491  * mod check
492  * Problems - where record does not exist or baseds has modified cols missing
493  */
494 proc sql noprint;
495 select distinct tgtvar_nm into: missvars separated by ' '
496  from &auditlibds
497  where move_type='M' and is_diff=1;
498 %let missvars=%mf_wordsinstr1butnotstr2(
499  Str1=&missvars,
500  Str2=%upcase(%mf_getvarlist(&baselibds))
501 );
502 %if %length(&missvars)>0 %then %do;
503  /* add them to the err table */
504  data &moderr;
505  if 0 then set &errds;
506  set &outmod;
507  PK_VARS="&key";
508  PK_VALS=catx('/',&commakey);
509  ERR_MSG="Rows cannot be modified due to missing base vars: &missvars";
510  keep PK_VARS PK_VALS ERR_MSG;
511  run;
512  proc append base=&errds data=&moderr;
513  run;
514  proc sql;
515  delete * from &outmod;
516 %end;
517 %else %do;
518  /* now check for records that do not exist (therefore cannot be modified) */
519  proc sql;
520  create table &modrec as
521  select a.*
522  from &outmod a
523  left join &base b
524  on &keyjoin
525  where b.%scan(&key,1) is null
526  order by &commakey;
527  data &moderr;
528  if 0 then set &errds;
529  set &modrec;
530  PK_VARS="&key";
531  PK_VALS=catx('/',&commakey);
532  ERR_MSG="Rows cannot be modified as they do not exist on the Base dataset";
533  keep PK_VARS PK_VALS ERR_MSG;
534  run;
535  proc append base=&errds data=&moderr;
536  run;
537  /* delete the above records from the outmod table */
538  data &outmod;
539  merge &outmod (in=a) &modrec (in=b);
540  by &key;
541  if not b;
542  run;
543  /* now - we can prepare the final MOD table (which is currently PK only) */
544  proc sql undo_policy=none;
545  create table &outmod as
546  select a.key_hash
547  ,b.*
548  from &outmod a
549  inner join &base b
550  on &keyjoin
551  order by &commakey;
552  /* now - to update outmod with modified (is_diff=1) values */
553  %let fref=%mf_getuniquefileref();
554  data _null_;
555  file &fref;
556  set &auditlibds(where=(move_type='M')) end=lastobs;
557  by key_hash;
558  retain comma 'N';
559  if _n_=1 then put 'proc sql;';
560  if first.key_hash then do;
561  comma='N';
562  put "update &outmod set " @@;
563  end;
564  if is_diff=1 then do;
565  if comma='N' then do;
566  put ' '@@;
567  comma='Y';
568  end;
569  else put ' ,'@@;
570  if tgtvar_type='C' then do;
571  length qstr $32767;
572  qstr=quote(trim(NEWVAL_CHAR));
573  put tgtvar_nm '=' qstr;
574  end;
575  else put tgtvar_nm '=' newval_num;
576  if comma=' ' then comma=' ,';
577  end;
578  if last.key_hash then put ' where key_hash=trim("' key_hash '");';
579  if lastobs then put "alter table &outmod drop key_hash;";
580  run;
581  %inc &fref/source2;
582 %end;
583 
584 %if &mdebug=0 %then %do;
585  proc datasets lib=work;
586  delete &prefix:;
587  run;
588  %put &sysmacroname exit vars:;
589  %put _local_;
590 %end;
591 %mend mp_stackdiffs;
592 /** @endcond */