PROC DS2 Performance Issues

I tried using PROC DS2 to speed up a regular DATA step through multithreading.
fred.testdata is an SPDE dataset containing 5 million cases. My code is below:

/* DS2 program: a thread program derives claimtypedet from claim, and a data
   program runs that thread program on 8 threads to build home_claims. */
proc ds2;

    /* Thread program: reads fred.testdata and recodes claim into claimtypedet. */
    thread home_claims_thread / overwrite = yes;

        /* Explicit declarations, currently disabled: */
        /*declare char(10) producttype;
          declare char(12) wrknat_clmtype;
          declare char(7) claimtypedet;
          declare char(1) event_flag;*/
        /*declare date week_ending having format date9.;*/

        method run();
            /*declare char(7) _week_ending;*/
            set fred.testdata;

            if claim = 'X' then claimtypedet = 'ABC';
            else if claim = 'Y' then claimtypedet = 'DEF';

            /* week_ending conversion, currently disabled: */
            /*_week_ending = COMPRESS(exposmth,'M');
              week_ending = to_date(substr(_week_ending,1,4) || '-' || substr(_week_ending,5,2) || '-01');*/
        end;

    endthread;

    /* Data program: collects the rows produced by the thread program. */
    data home_claims / overwrite = yes;
        declare thread home_claims_thread thr;

        method run();
            set from thr threads=8;
        end;
    enddata;

run;
quit;

I included only a few of the IF statements; the full set would take several pages (hopefully you get the idea). As it stands, this code runs considerably faster than a regular DATA step; however, significant performance problems arise when either of the following happens:

  • I uncomment any of the declare statements
  • I include any numeric variables in fred.testdata (even without doing any calculations on numeric variables)

My questions:

  • Is there a way to include numeric variables from fred.testdata without a slowdown so significant that DS2 becomes slower than a regular DATA step? (For this small table of 5 million rows, including the numeric column(s), the real time is about 1 min 30 s for DS2 versus 20 seconds for a regular DATA step.) The actual full table is closer to 600 million rows. For example, I would like to be able to do the week_ending conversion without incurring a 5-fold increase in run time. The run time for DS2 WITHOUT the declare statements and numeric variables is about 7 seconds.
  • Is there a way to compress a table in ds2 without having to perform an extra data step to compress it?

thanks

+5
source share
1 answer

There are two approaches to try: using PROC HPDS2 to let SAS handle the parallel execution, or a more manual approach. Please note that row order cannot always be preserved with either of these methods.

Method 1: PROC HPDS2

HPDS2 is a way of performing massively parallel data processing. In single-machine mode, it runs parallel executions per core and then puts all the data back together. You only need to make a few small changes to your code to run it.

HPDS2 is set up so that you declare your input and output data with the data= and out= options on the PROC statement. The DATA and SET statements inside the procedure then always use the following syntax:

  /* Skeleton of an HPDS2 data program: DS2GTF.in / DS2GTF.out stand in for
     the data= and out= tables named on the PROC HPDS2 statement. */
  data DS2GTF.out;
      method run();
          set DS2GTF.in;
          /* your DS2 statements go here */
      end;
  enddata;

Knowing that, we can adapt your code to run on HPDS2:

 /* HPDS2 version of the claims recode: input and output tables are named on
    the PROC statement and referenced inside the program as DS2GTF.in and
    DS2GTF.out. The input is fred.testdata, the same table the original DS2
    program reads. */
 proc hpds2 data=fred.testdata out=home_claims;

     data DS2GTF.out;

         /* Explicit declarations, currently disabled: */
         /*declare char(10) producttype;
           declare char(12) wrknat_clmtype;
           declare char(7) claimtypedet;
           declare char(1) event_flag;*/
         /*declare date week_ending having format date9.;*/

         method run();
             /*declare char(7) _week_ending;*/
             set DS2GTF.in;

             if claim = 'X' then claimtypedet = 'ABC';
             else if claim = 'Y' then claimtypedet = 'DEF';

             /* week_ending conversion, currently disabled: */
             /*_week_ending = COMPRESS(exposmth,'M');
               week_ending = to_date(substr(_week_ending,1,4) || '-' || substr(_week_ending,5,2) || '-01');*/
         end;

     enddata;

 run;
 quit;

Method 2: Split the data with rsubmit and append

The code below uses rsubmit and direct observation access (the firstobs= and obs= data set options) to read the data in chunks, then appends all the chunks together at the end. This can work especially well if your data is set up for block I/O.

 options sascmd='!sascmd' autosignon=yes noconnectwait noconnectpersist;

 /* Default number of worker sessions: the CPUCOUNT system option. */
 %let cpucount = %sysfunc(getoption(cpucount));

 /*
  * parallel_execute
  *
  * Splits &data into &threads contiguous observation ranges, processes each
  * range in its own SAS/CONNECT session (worker1..workerN) and appends the
  * resulting chunks into &out.
  *
  * Parameters:
  *   data=    input data set, one-level (WORK) or two-level name
  *   out=     output data set, one-level (WORK) or two-level name
  *   threads= number of worker sessions (default: &cpucount); capped at the
  *            number of observations so no worker gets FIRSTOBS > OBS
  *
  * Chunks are written to this session's WORK directory as _<out>_<i>.
  * The per-chunk logic goes where <PUT CODE HERE> is marked.
  */
 %macro parallel_execute(data=, out=, threads=&cpucount);

     /* Keep all working variables local so the caller's variables are not clobbered */
     %local dsid n rc i firstobs lastobs workdir inlib indsn outlib outdsn inherit;

     /* Get total obs from data */
     %let dsid = %sysfunc(open(&data.));
     %if &dsid. = 0 %then %do;
         %put ERROR: parallel_execute could not open &data.: %sysfunc(sysmsg());
         %return;
     %end;
     %let n  = %sysfunc(attrn(&dsid., nlobs));
     %let rc = %sysfunc(close(&dsid.));

     /* NLOBS is -1 when the count is not available; the ranges cannot be computed then */
     %if &n. < 0 %then %do;
         %put ERROR: parallel_execute cannot determine the number of observations in &data..;
         %return;
     %end;

     /* More workers than observations would yield ranges with FIRSTOBS > OBS */
     %if &threads. > &n. %then %let threads = %sysfunc(max(1, &n.));

     /* Work library location of this session; each worker assigns it as WORKDIR */
     %let workdir = %sysfunc(getoption(work));

     /* Check for an input libname */
     %if %scan(&data., 2, .) ne %then %do;
         %let inlib = %scan(&data., 1, .);
         %let indsn = %scan(&data., 2, .);
     %end;
     %else %do;
         %let inlib = work;
         %let indsn = &data.;
     %end;

     /* This session's WORK is reached from the worker through the WORKDIR
        path libref; any other libref is inherited by name. */
     %if %upcase(&inlib.) = WORK %then %do;
         %let inlib   = workdir;
         %let inherit = ;
     %end;
     %else %let inherit = inheritlib=(&inlib.);

     /* Check for an output libname */
     %if %scan(&out., 2, .) ne %then %do;
         %let outlib = %scan(&out., 1, .);
         %let outdsn = %scan(&out., 2, .);
     %end;
     %else %do;
         %let outlib = work;
         %let outdsn = &out.;
     %end;

     /* Run &threads rsubmit sessions */
     %do i = 1 %to &threads.;

         /* Determine the records that each worker will read */
         %let firstobs = %sysevalf(&n.-(&n./&threads.)*(&threads.-&i.+1)+1, floor);
         %let lastobs  = %sysevalf(&n.-(&n./&threads.)*(&threads.-&i.), floor);

         /* Send all user-made macro variables to the remote session */
         %syslput _USER_ / remote=worker&i.;

         /* Run code on remote session &i */
         rsubmit remote=worker&i. &inherit.;
             libname workdir "&workdir.";

             data workdir._&outdsn._&i.;
                 set &inlib..&indsn.(firstobs=&firstobs. obs=&lastobs.);
                 /* <PUT CODE HERE>;*/
             run;
         endrsubmit;
     %end;

     /* Wait for everything to complete */
     waitfor _ALL_;

     /* Append all of the chunks together; the chunks live in this session's WORK */
     proc datasets lib=&outlib. nolist;
         delete &outdsn.;
         %do i = 1 %to &threads.;
             append base=&outlib..&outdsn. data=work._&outdsn._&i. force;
         %end;
     quit;

     /* Optional: remove all temporary data */
     /* proc datasets lib=work nolist; delete _&outdsn._:; quit; */

     /* WORKDIR is only assigned inside the worker sessions, so there is
        nothing to clear in this session. */
 %mend;

You can check its functionality using the code below:

 /* Stage a copy of SASHELP.PRICEDATA under the one-level name pricedata. */
 data pricedata;
     set sashelp.pricedata;
 run;

 /* Split pricedata across three worker sessions and reassemble the chunks as test. */
 %parallel_execute(data=pricedata, out=test, threads=3);

If you look at the temporary files in your WORK directory, you will see that it splits the data set evenly among the three parallel processes, and that the chunks add up to the original total.

 _test_1 = 340 _test_2 = 340 _test_3 = 340 TOTAL = 1020 pricedata = 1020 
+3
source

Source: https://habr.com/ru/post/1273697/


All Articles