Tuesday, May 31, 2011

A scorecard for probability of default with sparkline


In the first chapter of their must-read credit risk modeling book, Gunter and Peter used the ratios of working capital (WC), retained earnings (RE), earnings before interest and taxes (EBIT), sales (S), and market value of equity (ME) over either total liabilities (TL) or total assets (TA) to build a logit default risk model on 4,000 records from 791 firms over 10 years [Ref. 1]. The authors implemented user-defined VBA functions in Excel to realize the modeling and scoring procedures. In SAS, Proc Logistic does the same job.
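In equation form, the fitted model links the default probability to the five ratios through the logit link (a generic sketch of the model that Proc Logistic estimates below, with the betas standing for the fitted coefficients):

$$PD = \frac{1}{1+\exp\!\left[-\left(\beta_0+\beta_1\tfrac{WC}{TA}+\beta_2\tfrac{RE}{TA}+\beta_3\tfrac{EBIT}{TA}+\beta_4\tfrac{ME}{TL}+\beta_5\tfrac{S}{TA}\right)\right]}$$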

Macros in SAS and Excel can be used together. Excel 2010's new Sparkline feature visualizes the fluctuations of values across many rows. For the firms in Gunter and Peter's example that eventually did not default, Sparklines help us observe the changing pattern of default risk for specific firms, such as the year in which a firm's default risk peaked. Collin and Eli described how to translate a SAS dataset into an Excel table [Ref. 2]. Thus, using their method, a SAS macro that generates VBA code can automate the construction of a default risk scorecard with Sparklines.

References:
1. Gunter Löffler and Peter Posch. 'Credit Risk Modeling Using Excel and VBA'. 2nd edition. Wiley.
2. Collin Elliot and Eli Morris. 'Excel-lent SAS Formulas: The Creation and Export of Excel Formulas Using SAS'. SAS Global Forum 2009.

/*******************READ ME*********************************************
* - A scorecard for probability of default(PD) with sparkline -
*
* SAS VERSION: 9.1.3
* EXCEL VERSION: 2010
* DATE: 31may2011
* AUTHOR: hchao8@gmail.com
*
****************END OF READ ME******************************************/

%macro splscd(data = , path = , filename = );
/*****************************************************************
* MACRO: splscd()
* GOAL: create xls and vba for a scorecard with sparkline
* PARAMETERS: data = dataset for modeling and scoring
* path = output path
* filename = name for scorecard
*****************************************************************/
options mprint mlogic;
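* build a lookup format that maps the column numbers 1-256 to Excel column letters (A, B, ..., AA, ..., IV);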
data _excelCol(where = (start le 256));
length label $2;
do c1 = -1 to 25;
do c2 = 0 to 25;
alpha1 = byte(rank('A')+ c1);
alpha2 = byte(rank('A')+ c2);
label = compress(alpha1||alpha2, " @");
start + 1;
fmtName = "column";
output;
end;
end;
run;

proc format cntlin = _excelCol;
run;

proc logistic data = &data;
model default = WCoverTA REoverTA EBIToverTA MEoverTL SoverTA;
score data = &data out = _scored;
run;

data _tmp01;
set _scored;
where default = 0;
keep id year p_1;
run;

proc sort data = _tmp01;
by id year;
run;

proc transpose data = _tmp01 out = _tmp02(keep = id year:) prefix = year;
by id;
id year;
var p_1;
run;

ods listing close;
ods output variables = _vartab;
proc contents data = _tmp02;
run;

proc sql;
select variable into: varlist separated by ' '
from _vartab
order by substr(variable, 5, 4)
;
select count(*) format = column. into: col_num
from _vartab
;
select count(*)+1 format = column. into: spl_col
from _vartab
;
quit;

data _tmp03;
retain &varlist;
set _tmp02 nobs = nobs;
sparkline =.;
call symput('nobs_plus', nobs + 1);
run;

ods html file = "&path\&filename..xls" style = minimal;
option missing = '';
title; footnote;
proc print data = _tmp03 noobs;
run;
ods html close;

proc sql;
create table _tmp04 (string char(200));
insert into _tmp04
values("Sub sas2vba()")
values("''''''''CREATE SPARKLINE'''''''''")
values('Columns("spl_col:spl_col").ColumnWidth=30')
values("Dim mySG As SparklineGroup")
values('Set mySG = _ ')
values('Range("$spl_col$2:$spl_col$nobs_plus").SparklineGroups.Add(Type:=xlSparkColumn, SourceData:="B2:col_numnobs_plus")')
values('mySG.SeriesColor.ThemeColor=6')
values("mySG.Points.Highpoint.Visible=True")
values("''''''''FORMAT THE TABLE'''''''''")
values('ActiveSheet.ListObjects.Add(xlSrcRange,Range("$A$1:$spl_col$nobs_plus"), , xlYes).Name="myTab"')
values('Range("myTab[#All]").Select')
values('ActiveSheet.ListObjects("myTab").TableStyle="TableStyleMedium1"')
values("''''''''SAVE AS EXCEL2010 FORMAT'''''''''")
values('ChDir "sas_path"')
values('ActiveWorkbook.SaveAs Filename:="sas_path\excel_file.xlsx",FileFormat:=xlOpenXMLWorkbook,CreateBackup:=False')
values("End Sub")
;
quit;

data _tmp05;
set _tmp04;
string = tranwrd(string, "spl_col", "&spl_col");
string = tranwrd(string, "nobs_plus", "&nobs_plus");
string = tranwrd(string, "col_num", "&col_num");
string = tranwrd(string, "sas_path", "&path");
string = tranwrd(string, "excel_file", "&filename");
if _n_ in (3, 6, 10) then string = compress(string, ' ');
run;

data _null_;
set _tmp05;
file "&path\&filename..bas";
put string;
run;

proc datasets nolist;
delete _:;
quit;
%mend splscd;

%splscd(data = test1, path = c:\tmp, filename = scorecard);

Sunday, May 29, 2011

Monte Carlo and quasi-Monte Carlo simulations for credit risk

In SAS, we can do quasi-Monte Carlo simulation with Halton's low-discrepancy sequences, which cover the unit interval more evenly than random draws from the uniform distribution. The only procedure in SAS 9.2 that produces Halton sequences is probably the MDC procedure in SAS/ETS [Ref. 2], but its sequences are hard to output. However, we can build a user-defined function with PROC FCMP to realize our goal. I show an example, adapted from the code in Paolo Brandimarte's Matlab book [Ref. 1], that estimates the area of an arbitrary surface. Overall, Halton's method converges as more sampling points are added, while uniform randomization causes the estimates to fluctuate around the true value.
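As a quick worked example of the radical-inverse construction that the halton() function below implements, take n = 6 and base b = 2:

$$6=(110)_2 \;\Rightarrow\; \text{reflect the digits about the radix point} \;\Rightarrow\; (0.011)_2=\tfrac{0}{2}+\tfrac{1}{4}+\tfrac{1}{8}=0.375$$

so halton(6, 2) returns 0.375.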

/*******************READ ME*********************************************
* - QUASI-MONTE CARLO SIMULATION BY FUNCTIONAL PROGRAMMING -
*
* VERSION: SAS 9.2(ts2m0), windows 64bit
* DATE: 02apr2011
* AUTHOR: hchao8@gmail.com
*
****************END OF READ ME*****************************************/

****************(1) MODULE-BUILDING STEP******************;
******(1.1) COMPILE USER-DEFINED FUNCTIONS*************;
proc fcmp outlib = sasuser.qmc.funcs;
/***********************************************************
* FUNCTION: h = halton(n, b)
* INPUT: n = the nth number | b = the base number
* OUTPUT: h = the halton's low-discrepancy number
***********************************************************/
function halton(n, b);
n0 = n;
h = 0;
f = 1 / b;
do while (n0 > 0);
n1 = floor(n0 / b);
r = n0 - n1*b;
h = h + f*r;
f = f / b;
n0 = n1;
end;
return(h);
endsub;

/***********************************************************
* FUNCTION: z = surface1(x, y)
* INPUT: x, y = independent variables
* OUTPUT: z = response variable
***********************************************************/
function surface1(x, y);
pi = constant("PI");
z = exp((-x)*y) * (sin(6*pi*x) + cos(8*pi*y));
return(z);
endsub;
run;

******(1.2) BUILD A 3D PLOTTING MACRO*************;
%macro plot3d(ds = , x = , y = , z = , width = , height = , zangle = );
ods html style = money;
ods graphics / width = &width.px height = &height.px imagefmt = png;
proc template;
define statgraph surfaceplotparm;
begingraph;
layout overlay3d / cube = true rotate = &zangle;
surfaceplotparm x = &x y = &y z = &z
/ surfacetype = fill surfacecolorgradient = &z;
endlayout;
endgraph;
end;
run;

proc sgrender data = &ds
template = surfaceplotparm;
run;
ods graphics off;
ods html close;
%mend;

****************END OF STEP (1)******************;

****************(2) PROGRAMMING STEP**********************;
******(2.1) IMPORT USER-DEFINED FUNCTIONS*************;
option cmplib = (sasuser.qmc);

******(2.2) DISTRIBUTIONS BETWEEN HALTON AND UNIFORM RANDOM NUMBERS*;
data test;
do n = 1 to 100;
do b = 2, 7;
halton_random_number = halton(n, b);
uniform_random_number = ranuni(20110401);
output;
end;
end;
run;

proc transpose data = test out = test1;
by n;
var halton_random_number uniform_random_number;
id b;
run;

******(2.3) GENERATE DATA FOR PLOTTING FUNCTION'S SURFACE****;
data surface;
do x = 0 to 1 by 0.01;
do y = 0 to 1 by 0.01;
z = surface1(x, y);
output;
end;
end;
run;

******(2.4) CONDUCT QUASI-MONTE CARLO SIMULATION***;
data simuds;
do i = 1 to 50;
do n = 1 to 100*i;
do b = 2, 7;
halton_random_number = halton(n, b);
uniform_random_number = ranuni(20110401);
output;
end;
end;
end;
run;

proc transpose data = simuds out = simuds_t;
by i n;
id b;
var halton_random_number uniform_random_number;
run;

data simuds1;
set simuds_t;
x = surface1(_2, _7);
run;

proc sql;
create table simuds2 as
select i, _name_, mean(x) as area label = 'Area by simulation'
from simuds1
group by i, _name_
;quit;

****************END OF STEP (2)******************;

****************(3) VISUALIZATION STEP*******************************;
******(3.1) PLOT DATA BY STEP 2.2*************;
ods html style = money;
proc sgscatter data = test1;
plot _2*_7 / grid group = _name_;
label _2 = 'Sequences with 2 as base'
_7 = 'Sequences with 7 as base'
_name_ = 'Methods';
run;
ods html close;

******(3.2) PLOT DATA BY STEP 2.3*************;
%plot3d(ds = surface, x = x , y = y, z = z, width = 800, height = 800, zangle = 60)

******(3.3) PLOT DATA BY STEP 2.4*************;
ods html style = ocean;
proc sgplot data = simuds2;
series x = i y = area / group = _name_ ;
refline 0.0199 / axis = y label = ('Real area');
xaxis label = 'Random numbers --- ×100';
label _name_ = 'Methods';
run;
ods html close;

****************END OF STEP (3)******************;

****************END OF ALL CODING***************************************;


We can use Monte Carlo simulation to evaluate the risk of a portfolio of credit instruments. For a risk manager, the common question is the probability of losing a given amount on a portfolio over a certain time period. To conduct the Monte Carlo simulation, four inputs are required: probability of default (PD), loss given default (LGD), exposure at default (EAD), and a weight (the correlation among individual credit events). Here I simulated a portfolio with 100 obligors. Then I ran a 10,000-iteration Monte Carlo loop and calculated the loss at given percentiles. The histogram of the portfolio loss is displayed above.
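In equation form, the single-factor model implemented in the DATA step inside the %mc macro below draws one systematic factor $Z$ per simulation cycle and an idiosyncratic shock $\varepsilon_i$ per obligor; obligor $i$ defaults when its asset value falls below the default threshold:

$$a_i = w\,Z + \sqrt{1-w^2}\,\varepsilon_i, \qquad \text{default if } a_i < \Phi^{-1}(PD_i), \qquad \text{loss}_i = LGD_i \times EAD_i$$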

data raw;
format pd lgd percent8.;
w = 0.3;
do i = 1 to 50;
PD = 0.01;
LGD= 0.5;
EAD= 100;
output;
end;
do i = 51 to 100;
PD = 0.05;
LGD= 0.5;
EAD= 100;
output;
end;
label pd = 'Probability of default'
lgd = 'Loss given default'
ead = 'Exposure at default'
w = 'Weight';
run;

libname mem 'c:\tmp' memlib;
data mem.raw;
set raw;
run;

%macro mc(cycle = );
filename junk dummy;
proc printto log=junk; run;
%do i = 1 %to &cycle;
%if %eval(&i) = 1 %then %do;
proc sql;
create table mem.result (sumloss num);
quit;
%end;
data mem._tmp01;
set mem.raw;
retain z;
if _n_ = 1 then z = rannor(0);
d = probit(pd);
a = w*z + (1-w**2)**0.5*rannor(0);
if a < d then loss = lgd*ead;
else loss = 0;
run;
proc sql;
insert into mem.result
select sum(loss) from mem._tmp01;
quit;
%end;
proc univariate data = mem.result noprint;
output out = mem.pct pctlpts = 90 95 99 99.5 pctlpre = loss;
run;
proc printto; run;
%mend;
%mc(cycle = 10000);

References:
1. Paolo Brandimarte. 'Numerical Methods in Finance'. John Wiley & Sons. 2002.
2. SAS/ETS 9.2 User's Guide. SAS Publishing. 2008.

Wednesday, May 25, 2011

Semiparametric methods in predicting loss given default


Sparse data is a big concern when building models for loss given default (LGD) in corporate credit risk. Most LGD predictors are instrument-related, firm-specific, macroeconomic, and industry-specific variables, and the cost of collecting such data can be relatively high. In one example of Gunter and Peter's book, the industry-level average default rate, the yearly average default rate, and the firm-level leverage ratio were applied to predict LGD. To increase the predictive power, a painful transformation of LGD was conducted [Ref. 1]. Actually, some nonlinear models could be considered instead.
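To give a concrete sense of what such a transformation can look like (this is only a sketch of one common choice, not necessarily the exact transform used in the book), LGD, which is bounded between 0 and 1, can be mapped onto the whole real line with a logit transform before a linear model is fit. The sketch uses the _tmp01 dataset that is read in later in this post:

data lgd_trans;
set _tmp01;
* squeeze LGD slightly away from the 0/1 boundaries, then map (0,1) onto the real line;
lgd_adj = min(max(lgd, 0.001), 0.999);
lgd_logit = log(lgd_adj / (1 - lgd_adj));
run;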

In a conference paper about consumer risk scoring, Wensui mentioned that the generalized additive model (GAM) can detect nonlinear relationships between risk behavior and predictors [Ref. 2]. In this example, we are probably most interested in estimating the parameter for firm-specific leverage (lev). Thus I used Proc GAM to estimate this variable's parameter while smoothing the other predictors with LOESS functions. In addition, I used Proc LOESS to run a fully nonparametric regression. Comparing the two methods in a series plot, their predictions of LGD are pretty close. As a result, Proc GAM may give us an insightful tool for constructing meaningful semiparametric regressions to predict LGD.

References:
1. Gunter Löffler and Peter Posch. 'Credit Risk Modeling Using Excel and VBA'. 2nd edition. Wiley. 2011.
2. Wensui Liu, Chuck Vu, Jimmy Cela. 'Generalizations of Generalized Additive Model (GAM): A Case of Credit Risk Modeling'. SAS Global Forum 2009.

data _tmp01;
infile "h:\raw_data.txt" delimiter = '09'x missover dsd firstobs=2;
informat lgd lev lgd_a i_def 8.3;
label lgd = 'Real loss given default'
lev = 'Leverage coefficient by firm'
lgd_a = 'Mean default rate by year'
i_def = 'Mean default rate by industry';
input lgd lev lgd_a i_def;
run;

ods html gpath = 'h:\' style = money;
ods graphics on;
proc loess data=_tmp01;
model lgd = lev lgd_a i_def / scale = sd select = gcv degree = 2;
score;
ods output scoreresults = predloess;
run;

proc gam data= _tmp01 plots = components(clm);
model lgd = loess(i_def) loess(lgd_a) param(lev) / method = gcv;
output out = predgam p = pbygam;
run;
ods graphics off;

data _tmp02;
merge predloess predgam;
keep p_LGD LGD pbygamLGD obs;
label p_LGD = 'Prediction by Proc LOESS'
pbygamLGD = 'Prediction by Proc GAM';
run;

proc sgplot data = _tmp02;
series x = obs y = lgd ;
series x = obs y = p_lgd;
series x = obs y = pbygamlgd;
yaxis label = 'loss given default';
run;
ods html close;

Thursday, May 19, 2011

Create transition matrices by cohort approach and hazard rate approach

------------The cohort approach macro has a bug and I am working on it ------------------------------------
The cohort approach is a widely used method for creating transition matrices to evaluate credit risk. In the example below, I made a macro to automate this process. Although the code builds one-year transition matrices, multi-year matrices can simply be derived from it.
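Under the cohort approach, the one-year transition probability from rating $i$ to rating $j$ is estimated by the row proportion that the macro pulls from PROC FREQ:

$$\hat{p}_{ij}=\frac{N_{ij}}{N_i}$$

where $N_{ij}$ is the number of obligors that start a year in rating $i$ and end it in rating $j$, and $N_i=\sum_j N_{ij}$.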
%macro cohort(filepath = );
   options mlogic mprint;
   data _tmp01;
      infile "&filepath\data.txt" delimiter='09'x missover dsd firstobs=2 ;
      informat id best12.; informat date mmddyy10.; informat rawrating $5.;
      input id date rawrating;
   run;
   proc format;
      value $rating
      'NR   ' =   0
      'AAA'   =   1
      'AA+'   =   2
      'AA   ' =   2
      'AA-'   =   2
      'A+   ' =   3
      'A   '  =   3
      'A-   ' =   3
      'BBB+'  =   4
      'BBB'   =   4
      'BBB-'  =   4
      'BB+'   =   5
      'BB   ' =   5
      'BB-'   =   5
      'B+   ' =   6
      'B   '  =   6
      'B-   ' =   6
      'CCC+'  =   7
      'CCC'   =   7
      'CCC-'  =   7
      'CC   ' =   7
      'C   '  =   7
      'D   '  =   8
   ;;;
   run;
   data _tmp02;
      set _tmp01;
      year = year(date);
      rating = input(put(rawrating, $rating.), 8.); * store the grade as a number so the later numeric comparisons and filters work;
      drop rawrating;
   run;

   proc sort data = _tmp02;
      by id year date;
   run;
   data _tmp03;
      set _tmp02;
      by id year;
      if last.year;
      drop date;
   run;
   proc sql;
      select min(year) into :y_start from _tmp03;
      select max(year)-1 into :y_end from _tmp03;
   quit;

   proc transpose data = _tmp03 out = _tmp04 prefix = year;
      by id;
      var rating;
      id year;
   run;
   data _tmp05;
      set _tmp04;
      %do num1 = &y_start %to &y_end;
         %let num2 = %eval(&num1 + 1);
         if year&num1 not in (., 0, 8) and year&num2 = . then year&num2 = year&num1;
      %end;
   run;

   proc transpose data = _tmp05 out = _tmp06(rename = (col1 = rating));
      by id;
      var year:;
   run;
   proc sort data = _tmp06;
      by id _name_;
      where rating ne .;
   run;
   data _tmp07;
      set _tmp06;
      year = input(substr(_name_, 5, 4), 4.);
      drop _name_;
   run;

   proc sql;
      create table _tmp08 as
      select a.id, a.year as start_year, a.rating as start_rating,
            b.year as end_year, b.rating as end_rating
      from _tmp07(where=(rating not in (0, 8))) as a, _tmp07 as b
      where a.id = b.id and b.year - a.year = 1
   ;quit;

   proc freq data = _tmp08;
      table start_rating * end_rating / nofreq nocol nopercent;
      ods output crosstabfreqs = _tmp09;
   run;
   data _tmp10;
      set _tmp09;
      length end_rating_char $3;
      end_rating_char = cats(end_rating, '');
      if end_rating = 0 then end_rating_char = 'NR';
      rowpercent = rowpercent / 100;
      where RowPercent is not missing;
   run;

   ods html file = "&filepath\tmatrice.xls" style = minimal;
   title; footnote;
   proc report data = _tmp10 nowd headline split = '|';
      col start_rating end_rating_char,RowPercent;
      define start_rating / group 'Rating at the|start of one year';
      define end_rating_char / across 'Rating at the end of one year';
      define rowpercent / '' format = percent8.2;
   run;
   ods html close;
%mend cohort;
For the hazard rate approach, the generator matrix can be obtained with code like:

data one;
    format date date9.;
    input Id Date : date9. Rating;
    datalines;
    1    30-May-00    7
    1    31-Dec-00    6
    2    21-May-03    1
    3    30-Dec-99    5
    3    30-Oct-00    6
    3    30-Dec-01    5
    4    30-Dec-01    5
    4    30-May-02    6
    5    24-May-00    2
    5    30-May-01    3
    5    30-Oct-01    2
    6    30-Dec-99    4
    6    30-Dec-01    4
    7    30-Dec-02    4
    7    23-Jun-03    5
    7    30-Dec-03    6
    7    21-May-04    5
    8    30-Dec-02    3
    9    21-May-00    2
    9    30-Dec-00    0
    10   30-Dec-04    5
    11  30-Dec-99    5
    11  30-Dec-01    6
    11  21-May-02    7
    11  30-Sep-02    8
    12  30-Dec-00    0
    13  30-Dec-99    4
    13  30-May-03    5
    14  21-Sep-99    5
    14  30-Dec-99    5
    14  30-Dec-01    5
    14  26-May-02    6
    14  21-May-04    5
    ;
run;

proc sort data = one;
   by id date;
run;

proc sql noprint;
   select max(date) into: maxdat
   from one;
quit;
data two(drop=nextid nextdat);
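   * look ahead to each obligor's next observation by merging the data with itself shifted up one row (firstobs = 2);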
   merge one one(firstobs = 2 rename=(rating=nextrat id=nextid date=nextdat));
   if id ne nextid then nextrat = rating;
   if id = nextid then spell = nextdat - date;
   else spell = &maxdat - date;
   spell = spell / 365;
run;

proc sql;
   create table three as
   select a.*, a.count / b.sumspell as ratio
   from (select rating, nextrat, count(*) as count
           from two group by rating, nextrat) as a,
        (select rating, sum(spell) as sumspell
           from two group by rating) as b
   where a.rating = b.rating
;quit;

proc sql;
   create table four as
   select rating, rating as nextrat, sum(ratio) as sumratio
   from three
   where rating ne nextrat
   group by rating
;quit;

data five;
   merge three four;
   by rating nextrat;
   if rating = nextrat then ratio = -sumratio;
run;

options ls = 256 nocenter missing = 0;
proc report data = five nowd headline;
   title 'Generator matrix';
   columns rating nextrat,ratio ;
   define rating / group 'From';
   define nextrat / across 'To';
   define ratio / ' ' format = 8.2;
run;
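The table above is the generator matrix, not yet a transition matrix: under the hazard rate approach, the one-year transition matrix is the matrix exponential of the generator. A minimal sketch of that final step in PROC IML is shown below; it assumes the long table FIVE has already been rearranged into a square matrix (a small hypothetical 3x3 generator is hard-coded here for illustration), and a truncated Taylor series stands in for a dedicated matrix-exponential routine.

proc iml;
/* hypothetical 3x3 generator: rows sum to zero, last state absorbing */
gen = { -0.50  0.40  0.10,
         0.20 -0.30  0.10,
         0.00  0.00  0.00 };
p    = i(nrow(gen));                /* identity matrix = 0th Taylor term          */
term = i(nrow(gen));
do k = 1 to 20;                     /* exp(gen) = sum over k of gen^k / k!        */
   term = term * gen / k;
   p = p + term;
end;
print p[format = 8.4];              /* one-year transition matrix; rows sum to 1  */
quit;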

Wednesday, May 18, 2011

A macro calls random forest in SAS




SASHELP.CARS, with 428 observations and 15 variables, is a free dataset in SAS that I use to exercise classification methods. I have always had the fantasy of predicting where a random car is manufactured, such as the US, Japan, or Europe. After trying many methods in SAS, including decision trees, logistic regression, k-NN, and SVM, I eventually found that random forest, an ensemble classifier built from many decision trees [Ref. 1], can slash the overall misclassification rate to around 25%. The SAS code is powered by R's 'randomForest' package. In my tiny experiment, an ensemble of 100 trees seems to achieve the optimum effect.

The concept of random forest was first proposed by Leo Breiman and Adele Cutler [Ref. 2], who also developed elegant Fortran code for it. Andy Liaw at Merck did a fantastic job porting that Fortran code to R [Ref. 3]. Now everybody with a computer can use this state-of-the-art classification method for fun or work.

Reference:
1. Albert Montillo. ‘Random Forest’. http://www.ist.temple.edu/
2. Leo Breiman and Adele Cutler. http://stat-www.berkeley.edu/users/breiman/RandomForests/
3. Andy Liaw. ‘randomForest: Breiman and Cutler's random forests for classification and regression’. http://cran.r-project.org/web/packages/randomForest/index.html

/*******************READ ME*********************************************
* - A macro calls random forest in SAS by R -
*
* SAS VERSION: 9.1.3
* R VERSION: 2.13.0 (library: 'randomForest', 'foreign')
* DATE: 18may2011
* AUTHOR: hchao8@gmail.com
*
****************END OF READ ME******************************************/

****************(1) MODULE-BUILDING STEP********************************;
%macro rf(train = , validate = , result = , targetvar = , ntree = ,
tmppath = , rpath = );
/*****************************************************************
* MACRO: rf()
* GOAL: invoke randomForest in R to perform random forest
* classification in SAS
* PARAMETERS: train = dataset for training
* validate = dataset for validation
* result = dataset after prediction
* ntree = number of trees specified
* targetvar = target variable
* tmppath = temporary path for exchange files
* rpath = installation path for R
*****************************************************************/
proc export data = &train outfile = "&tmppath\sas2r_train.csv" replace;
run;
proc export data = &validate outfile = "&tmppath\sas2r_validate.csv" replace;
run;
proc sql;
create table _tmp0 (string char(200));
insert into _tmp0
set string = 'train=read.csv("sas_path/sas2r_train.csv",header=T)'
set string = 'validate=read.csv("sas_path/sas2r_validate.csv",header=T)'
set string = 'sink("sas_path/result.txt", append=T, split=F)'
set string = 'require(randomForest,quietly=T)'
set string = 'model=randomForest(sas_targetvar~ .,data=train,'
set string = 'do.trace=10,ntree=sas_treenumber,importance=T)'
set string = 'predicted = predict(model,newdata=validate,type="class")'
set string = 'result=as.data.frame(predicted)'
set string = 'importance(model)'
set string = 'table(validate$sas_targetvar, predicted)'
set string = 'require(foreign, quietly=T)'
set string = 'write.foreign(result,"sas_path/r2sas_tmp.dat",'
set string = '"sas_path/r2sas_tmp.sas",package="SAS")';
quit;
data _tmp1;
set _tmp0;
string = tranwrd(string, "sas_treenumber", "&ntree");
string = tranwrd(string, "sas_targetvar", propcase("&targetvar"));
string = tranwrd(string, "sas_path", translate("&tmppath", "/", "\"));
run;
data _null_;
set _tmp1;
file "&tmppath\sas_r.r";
put string;
run;

options xsync xwait;
x "cd &rpath";
x "R.exe CMD BATCH --vanilla --slave &tmppath\sas_r.r";

data _null_;
infile "&tmppath\result.txt";
input;
if _n_ = 1 then put "NOTE: Statistics by R";
put _infile_;
run;

%include "&tmppath\r2sas_tmp.sas";
data &result;
set &validate;
set rdata;
run;
%mend rf;

****************(2) TESTING STEP****************************************;
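/* The %rf() call below assumes that cars_train and cars_validate already exist.
   The lines that follow are a minimal sketch (not part of the original macro) that
   creates them from SASHELP.CARS with a 50/50 split, keeping the target and the
   numeric predictors; the %partbyprop2 macro from the May 3 post does the same
   job more generally. */
proc surveyselect data = sashelp.cars samprate = 0.5 seed = 20110518
out = _cars outall;
run;
data cars_train cars_validate;
set _cars;
keep origin msrp invoice enginesize cylinders horsepower
mpg_city mpg_highway weight wheelbase length;
if nmiss(of _numeric_) = 0; * keep complete cases only, since randomForest rejects missing values;
if selected = 0 then output cars_train;
else output cars_validate;
run;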
%rf(train = cars_train, validate = cars_validate, result = cars_result,
targetvar = origin, ntree = 100, tmppath = c:\tmp,
rpath = D:\Program Files\R\R-2.13.0\bin);

****************END OF ALL CODING***************************************;

Saturday, May 14, 2011

Macros communicate SQLite and SAS without ODBC


SQLite is an open-source relational database management system with full functionality [Ref. 1]. Its light weight (roughly 300 KB) and zero-configuration design distinguish it from its 800-pound counterparts like Oracle or MySQL. Thanks to the rise of mobile devices (plus the SQLite-embedded Firefox), SQLite will probably be seen everywhere pretty soon.

I just love SQLite, since it helped me learn not only to write SQL on Windows and Linux but also to manage complicated databases. Both Python and R have nice support for SQLite, and I have always wanted to use SQLite as a front end or backup store for SAS. The shortcut is to apply a third-party SQLite ODBC driver [Ref. 2]. However, those drivers never worked very well on my workstations. To bypass the ODBC route, Wensui designed a macro that uses SQLite's '.dump' command to generate a SQL file for SAS [Ref. 3]. To establish two-way communication, I wrote the two macros below to export a SQLite table to SAS, and vice versa. The macros use tab-delimited text as the medium and SQLite's batch mode to execute the script on a PC.
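As an illustration of the batch-mode idea, for the iris test at the end of this post the first macro writes a sas_2_sqlite.sql script that it then feeds to sqlite3; the generated script should look roughly like this (column names and types are taken from PROC CONTENTS):

.stats on
create table iris(Species text, SepalLength real, SepalWidth real, PetalLength real, PetalWidth real);
.separator "\t"
.import 'c:\tmp\sas_2_sqlite.txt' iris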

Reference:
1. Grant Allen, Mike Owens. ‘The Definitive Guide to SQLite’. Apress Publishing.
2. SQLite ODBC Driver. http://www.ch-werner.de/sqliteodbc/
3. Wensui Liu. 'SAS Macro Importing SQLite Data Table Without ODBC'.

/*******************READ ME*********************************************
* - Macros communicate SQLite and SAS without ODBC -
*
* SAS VERSION: 9.1.3
* SQLITE VERSION: 3.7.4
* DATE: 14may2011
* AUTHOR: hchao8@gmail.com
*
****************END OF READ ME******************************************/

****************(1) MODULE-BUILDING STEP********************************;
******(1.1) BUILD A MACRO FROM SAS TO SQLITE****************************;
%macro sas2sqlite(sastable = , path = , database = );
/*****************************************************************
* MACRO: sas2sqlite()
* GOAL: output a dataset in SAS to a table in SQLite
* PARAMETERS: sastable = dataset in SAS for SQLite
* path = destinate file path for SQLite database
* database = name of SQLite database
*****************************************************************/
proc export data = &sastable outfile = "&path\sas_2_sqlite.txt" dbms = tab
replace;
putnames = no;
run;

ods listing close;
ods output variables = _varlist;
proc contents data = &sastable;
run;
proc sort data = _varlist;
by num;
run;

data _tmp01;
set _varlist;
if lowcase(type) = 'num' then vartype = 'real';
else if lowcase(type) = 'char' then vartype = 'text';
run;
proc sql noprint;
select trim(variable) ||' '|| trim(vartype)
into: table_value separated by ', '
from _tmp01
;quit;

proc sql;
create table _tmp02 (string char(800));
insert into _tmp02
set string = '.stats on'
set string = 'create table sas_table(sas_table_value);'
set string = '.separator "\t"'
set string = ".import 'sas_path\sas_2_sqlite.txt' sas_table"
;quit;

data _tmp03;
set _tmp02;
string = tranwrd(string, "sas_table_value", "&table_value");
string = tranwrd(string, "sas_table", "&sastable");
string = tranwrd(string, "sas_path", "&path");
run;
data _null_;
set _tmp03;
file "&path\sas_2_sqlite.sql";
put string;
run;
options noxsync noxwait;
x "sqlite3 -init &path\sas_2_sqlite.sql &path\&database";

proc datasets;
delete _:;
quit;
ods listing;
%mend;

******(1.2) BUILD A MACRO FROM SQLITE TO SAS***************************;
%macro sqlite2sas(sqlitetable = , path = , database = );
/*****************************************************************
* MACRO: sqlite2sas()
* GOAL: output a table in SQLite to a dataset in SAS
* PARAMETERS: sqlitetable = table in SQLite for SAS
* path = target file path for SQLite database
* database = name of SQLite database
*****************************************************************/
proc sql;
create table _tmp0 (string char(800));
insert into _tmp0
set string = ".output 'output_path\sqlite_2_sas.txt' "
set string = '.separator "\t" '
set string = '.headers on'
set string = 'select * from sqlite_table;'
set string = '.output stdout'
;quit;
data _tmp1;
set _tmp0;
string = tranwrd(string, "sqlite_table", "&sqlitetable");
string = tranwrd(string, "output_path", "&path");
run;
data _null_;
set _tmp1;
file "&path\sas_2_sqlite.sql";
put string;
run;

options noxsync noxwait;
x "sqlite3 -init &path\sas_2_sqlite.sql &path\&database ";

proc import datafile = "&path\sqlite_2_sas.txt" out = &sqlitetable
dbms = dlm replace;
delimiter = '09'x;
guessingrows = 10000;
run;
proc datasets nolist;
delete _:;
run;
%mend;

****************(2) TESTING STEP****************************************;
******(2.1) TESTING THE FIRST MACRO*************************************;
data iris;
set sashelp.iris;
run;
%sas2sqlite(sastable = iris, path = c:\tmp, database = sas_sqlite.sqlite);

******(2.2) TESTING THE SECOND MACRO************************************;
proc datasets;
delete iris;
quit;
%sqlite2sas(sqlitetable = iris, path = c:\tmp, database = sas_sqlite.sqlite);

****************END OF ALL CODING***************************************;

Wednesday, May 11, 2011

SAS makes spreadsheet for reporting


Excel is the last stop in the pipeline of my daily work. Most of my clients like colorful multi-sheet spreadsheets more than plain CSV files (I guess they all use Windows). I am not a power user of Excel, and honestly I am a little afraid of it: sometimes things went wrong when I accidentally dragged cells and made duplicates. As a result, I tend to have everything ready in SAS and treat Excel only as the delivery box. At the beginning I preferred the high-tech ODBC engine to exchange data; I eventually gave up when I found that my IT support doesn't know how to add the SAS/ACCESS module even though we have the license. The EXPORT procedure is widely used, but it does not allow traffic lighting, which labels values with distinctive colors. MSOFFICE2K, an ODS tagset from SAS to Excel, ceased to work [Ref. 1]. Besides those methods, SAS has some alternative ways to prepare pretty Excel reports. Each year Vincent at SAS updates ExcelXP, a 6,000-line SAS tagset, for multi-sheet output through ODS [Ref. 2]. This year Romain's paper elaborated possibly all eight methods, and no wonder his amazing summary won the best paper award in the Coders' Corner section [Ref. 3].

Three routines in SAS satisfy 99% of my requirements for an Excel report. First, SAS's HTML ODS destination can directly write an XLS file, which is good enough for any single-sheet spreadsheet. Second, thanks to Vincent, ExcelXP provides the perfect tool for generating multi-sheet spreadsheets. The only place where I need to do extensive coding is finding optimal values for the 'absolute_column_width' option; I hope next year the code will be improved to address this particular question. Third, if some of the cells in Excel need calculation, I like to call DDE, an old but still robust technology, to read the data into SAS and later write the results back (I also use R as a scientific calculator to do algebra).

In conclusion, a report produced by code can be reproduced more easily than one built with many point-and-click operations. That's probably why I like SAS more than Excel.

Reference:
1. ‘Vanilla output using ODS’. SAS-L. 05May2011
2. Vincent DelGobbo. ‘Creating Stylish Multi-Sheet Microsoft Excel Workbooks the Easy Way with SAS’. SAS Global 2011.
3. Romain Miralles. ‘Creating an Excel report: A comparison of the different techniques’. SAS Global 2011. http://support.sas.com/rnd/papers/index.html

/*******************READ ME*********************************************
* - SAS MAKES SPREADSHEET FOR REPORTING -
*
* SAS VERSION: SAS 9.1.3
* DATE: 11may2011
* AUTHOR: hchao8@gmail.com
*
****************END OF READ ME******************************************/

****************(1) MULTIPLE SHEET REPORTING FOR EXCEL *****************;
******(1.1) FIND EXCEL'S COLUMN WIDTH AND MAKE TRAFFIC LIGHTS***********;
%macro excelcol(data = , groupvar = );
/*****************************************************************
* MACRO: excelcol()
* GOAL: find the optimum column size for EXCEL
* PARAMETERS: data = data used for output
* groupvar = variable for sheet separation
*****************************************************************/
%global var_list len_list;
ods listing close;
ods output variables = _varlist;
proc contents data = &data;
run;
proc sort data = _varlist;
by format num;
run;
data _varlist;
set _varlist;
len_var = length(variable);
len_lab = length(label);
max_len = max(of len:);
run;

proc sql;
select variable into: var_list separated by ' '
from _varlist
where lowcase(variable) ne "&groupvar"
;
select max_len into: len_list separated by ','
from _varlist
where lowcase(variable) ne "&groupvar"
;
quit;
ods listing;
%mend excelcol;
%excelcol(data = sashelp.cars, groupvar = origin);

proc format;
value price
40000 - high = '#FFFFCC'
26000 -< 40000 = 'Yellow'
other = '#FF9900';
run;
ods listing;

******(1.2) WRITE FINAL REPORT TO EXCEL********************************;
proc template;
define style styles.xlsansprinter;
parent = styles.sansprinter;
style header from header /
font_size = 10pt just = center vjust = bottom;
end;
run; quit;

ods tagsets.excelxp path="c:\" file="cars.xml" style=xlsansprinter
options(sheet_interval="bygroup" sheet_label=" "
suppress_bylines="yes" autofilter="2-14"
absolute_column_width="&len_list");
title; footnote;
proc report data = sashelp.cars nowd split='*';
by make;
column &var_list;
define model / style(column) = [just=center font_weight=bold];
define invoice / style(column) = [foreground=lime];
compute model;
rownum + 1;
if (mod(rownum, 2) ne 0)
then call define(_row_, 'style', 'style = [background=#99ccff]');
endcomp;
compute msrp;
call define(_col_, 'style', 'style = [background=price.]');
endcomp;
compute before _page_;
line "Made by &sysuserid on &sysday., &sysdate";
endcomp;
run; quit;
ods tagsets.excelxp close;
ods listing;

****************(2) SINGLE SHEET REPORTING FOR EXCEL *****************;
ods listing close;
ods html file = "c:\cars2.xls" gpath = "c:\" style = minimal;
title; footnote;
proc print data = sashelp.cars;
id make;
var &var_list;
var msrp / style = [background=price.];
run;

proc gplot data = sashelp.cars;
plot msrp * invoice;
run;
ods html close;
ods listing;

****************(3) DDE BETWEEN SAS AND EXCEL ************************;
******(3.1) EXTRACT DATA FROM EXCEL TO COMPUTE IN SAS*****************;
filename _infile dde 'clipboard';
data _tmp01;
infile _infile notab missover dlm = '09'x dsd;
informat var1 var2 dollar8.;
input var1 var2;
run;

data _tmp02;
set _tmp01;
mean = mean(of var:);
std = std(of var:);
run;

******(3.2) WRITE STATISTICS BACK TO EXCEL****************************;
options noxwait noxsync;
x "c:\cars2.xls";

filename _outfile dde 'excel|c:\[cars2.xls]cars2!r2c17:r28c18';
data _null_;
set _tmp02;
file _outfile notab dlm='09'x;
format mean std dollar8.;
put mean std;
run;

****************END OF ALL CODING***************************************;

Friday, May 6, 2011

Multidimensional scaling for ZIP codes clustering


Multidimensional scaling maps the distances among multiple objects into a space of two or more dimensions. The method is getting hotter for analyzing social networks, since many social networking websites now offer handy tools to visualize users' connections. SAS's MDS procedure, based on such an algorithm, is a fascinating tool. Larry [Ref. 1] used it to map SAS-L, an email list, into a circular shape by extracting threads and email addresses. Proc MDS is also used to turn customers' perceptions into perceptual maps.

In business, some direct marketing activities need to scale down the number of ZIP code levels. For example, Texas has 2,650 ZIP codes, and sometimes it is useful to divide them into manageable sectors. SAS 9.2 has built-in ZIP code and map datasets, and the 'zipcitydistance' function looks up the distance between any pair of ZIP codes. First, a 2650-by-2650 matrix of the distances between every two ZIP codes in Texas was constructed. From this matrix, Proc MDS then calculated two dimension variables, and a subsequent clustering procedure separated the ZIP codes into 5 clusters. Proc GMAP allows generating customized maps [Ref. 2], so I used it to annotate those dots back onto a physical map. The comparison between the two images shows that the reconstructed relative locations are pretty accurate, though the orientation of the map produced by Proc MDS is not quite correct.

Reference:
1. Larry Hoyle. ‘Visualizing Two Social Networks Across Time with SAS’. SAS Global 2009.
2. Darrell Massengill and Jeff Phillips. ‘Tips and Tricks IV: More SAS/GRAPH Map Secrets’. SAS Global 2009.

/*******************READ ME*********************************************
* - MULTIDIMENSIONAL SCALING FOR ZIP CODES CLUSTERING -
*
* SAS VERSION: SAS 9.2.2
* DATE: 07may2011
* AUTHOR: hchao8@gmail.com
*
****************END OF READ ME******************************************/

****************(1) RETRIEVE MAP AND ZIP CODE DATA IN TEXAS FROM SAS ***;
data txzip;
set sashelp.zipcode;
where statecode = "TX";
run;

data txmap;
length x y 8;
set maps.counties;
where state = 48;
run;

****************(2) CREATE A ZIP-TO-ZIP DISTANCE MATRIX*****************;
proc sql;
create table zip01 as
select a.zip as zipa, b.zip as zipb,
zipcitydistance(zipa, zipb) as distance
from txzip as a, txzip as b
;quit;

proc transpose data = zip01 out = zip02 prefix = var;
by zipa;
id zipb;
var distance;
run;

****************(3) CONDUCT MDS AND CLUSTERING WITH 5 CLUSTERS**********;
proc mds data = zip02 level = absolute out = mds_done ;
id zipa;
run;

proc fastclus data = mds_done maxc = 5 out = clus_done;
var dim:;
where _name_ is not missing;
run;

proc sgscatter data = clus_done;
plot dim1 * dim2 / group = cluster grid;
run;

****************(4) TRANSFORM ZIP CODE TO GEOGRAPHIC DATA FOR ANNOTATION**;
proc sql;
create table clus_zip as
select a.zipa as zip, a.cluster, b.x, b.y
from clus_done as a left join txzip as b
on a.zipa = b.zip
;quit;

data clus_zip1;
retain anno_flag 1;
set clus_zip;
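* convert longitude and latitude from degrees to radians (atan(1)/45 = pi/180) and flip the sign of longitude for PROC GPROJECT;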
x = -x * atan(1) / 45;
y = y * atan(1) / 45;
length function style color text $8;
function = 'label';
xsys='2'; ysys='2'; hsys='3';
when='A'; style='special'; text='L';
size = 2; position = 'E';
if cluster = 1 then color = 'blue';
else if cluster = 2 then color = 'red';
else if cluster = 3 then color = 'purple';
else if cluster = 4 then color = 'green';
else color = 'yellow';
run;

****************(5) MAP CLUSTERED ZIP CODES GEOGRAPHICALLY ******************;
data combine;
set txmap clus_zip1;
run;

proc gproject data = combine out = combined dupok;
id county;
run;

data txmap anno_dots;
set combined;
if anno_flag > 0 then output anno_dots;
else output txmap;
drop anno_flag;
run;

ods html gpath = 'c:\';
goptions reset=all dev=gif xpixels = 1280 ypixels = 1024;
proc gmap data = txmap map = txmap anno = anno_dots;
id county;
choro state / nolegend;
run;
quit;
ods html close;

****************END OF ALL CODING***************************************;

Tuesday, May 3, 2011

Support vector machine in SAS by R


I have recently discovered endless fun in synchronizing SAS and R to do something meaningful. Yep, I am a SAS programmer: during the daytime, I use SAS for my work; in the evening, I use R for entertainment. It is always exciting to hook them up together. How about a SAS/R module, like SAS/STAT or SAS/BASE, in the future?

Some SAS programmers or SAS 'developers' have already used code to communicate between SAS and R [Ref. 1 and 2] (thanks to Rick Wicklin for mentioning them). Since R can write a dataset as SAS code (the 'foreign' package) and SAS can call R through the X command for batch execution, so far I haven't found much difficulty in using SAS as a GUI for R.

Support vector machine (SVM) is a cool and fancy classification method. It is said that a secret SVM procedure is already running under SAS Enterprise Miner (I have not had a chance to try it yet). The 'e1071' package in R provides state-of-the-art routines for SVM classification. So I made a small macro to call it from SAS, which performs like a typical SAS procedure. I hope everyone who works with data can enjoy it.

Reference:
1. Philip R Holland. 'SAS to R to SAS'. Holland Numerics Limited.
2. Phil Rack. 'A Bridge to R for SAS Users'. www.MineQuest.com

/*******************READ ME*********************************************
* - SUPPORT VECTOR MACHINE FOR CLASSIFICATION IN SAS BY R -
*
* SAS VERSION: SAS 9.1.3
* R VERSION: R 2.13.0 (library: 'e1071', 'foreign')
* DATE: 03may2011
* AUTHOR: hchao8@gmail.com
*
****************END OF READ ME******************************************/

****************(1) MODULE-BUILDING STEP******************;
%macro svm(train = , validate = , result = , targetvar = , tmppath = , rpath = );
/*****************************************************************
* MACRO: svm()
* GOAL: invoke e1071 in R to perform support vector machine
* classification in SAS
* PARAMETERS: train = dataset for training
* validate = dataset for validation
* result = dataset after prediction
* targetvar = target variable
* tmppath = temporary path for exchange files
* rpath = installation path for R
*****************************************************************/
proc export data = &train outfile = "&tmppath\sas2r_train.csv" replace;
run;
proc export data = &validate outfile = "&tmppath\sas2r_validate.csv" replace;
run;

proc sql;
create table _tmp0 (string char(80));
insert into _tmp0
set string = 'train=read.csv("sas_path/sas2r_train.csv",header=T)'
set string = 'validate=read.csv("sas_path/sas2r_validate.csv",header=T)'
set string = 'require(e1071,quietly=T)'
set string = 'model=svm(sas_targetvar ~ . ,data=train)'
set string = 'predicted=predict(model,newdata=validate,type="class")'
set string = 'result=as.data.frame(predicted)'
set string = 'require(foreign, quietly=T)'
set string = 'write.foreign(result,"sas_path/r2sas_tmp.dat",'
set string = '"sas_path/r2sas_tmp.sas",package="SAS")';
quit;
data _tmp1;
set _tmp0;
string = tranwrd(string, "sas_targetvar", propcase("&targetvar"));
string = tranwrd(string, "sas_path", translate("&tmppath", "/", "\"));
run;
data _null_;
set _tmp1;
file "&tmppath\sas_r.r";
put string;
run;

options xsync xwait;
x "cd &rpath";
x "R.exe CMD BATCH --vanilla --slave &tmppath\sas_r.r";

data _null_;
infile "&tmppath\sas_r.r.rout";
input;
if _n_ = 1 then put "NOTE: Time used by R";
put _infile_;
run;

%include "&tmppath\r2sas_tmp.sas";
data &result;
set &validate;
set rdata;
run;

proc datasets nolist;
delete _: rdata;
quit;
%mend;

****************(2) TESTING STEP******************;
******(2.1) BUILD A PARTITION MACRO TO SEPARATE TESTING DATASET*************;
%macro partbyprop2(data = , targetvar = , samprate = , seed = , train = , validate = );
/**************************************************************
* MACRO: partbyprop2()
* GOAL: partition dataset by target variable's proportion
* and choose numerical variables for classification
* PARAMETERS: data = input dataset
* targetvar = target variable
* samprate = ratio of train v.s. validate datasets
* seed = random seed for sampling
**************************************************************/
ods listing close;
ods output variables = _varlist;
proc contents data = &data;
run;
proc sql;
select variable into: num_var separated by ' '
from _varlist
where lowcase(type) = "num";
quit;

proc sort data = &data out = _tmp1;
by &targetvar;
run;

proc surveyselect data = _tmp1 samprate = &samprate
out = _tmp2 seed = &seed outall;
strata &targetvar / alloc = prop;
run;

data &train &validate;
set _tmp2;
format _numeric_ best13.;
keep &num_var &targetvar;
if selected = 0 then output &train;
else output &validate;
run;

proc datasets;
delete _:;
quit;
ods listing;
%mend;

******(2.2) DIVIDE SASHELP.IRIS DATASET INTO TWO EQUAL PARTS*************;
%partbyprop2(data = sashelp.iris, targetvar = species, samprate = 0.5, seed = 20110503,
train = iris_train, validate = iris_validate);

******(2.3) USE THE SVM MACRO*************;
%svm(train = iris_train, validate = iris_validate, result = iris_result,
targetvar = species, tmppath = c:\tmp, rpath = c:\Program Files\R\R-2.13.0\bin);

****************(3) VISUALIZATION STEP******************;
data iris_visual;
set iris_result;
length color shape $8.;
predvalue = put(predicted, predictd.);
if species = "Setosa" then shape = "club";
if species = "Versicolor" then shape = "diamond";
if species = "Virginica" then shape = "spade";
if predvalue = "Setosa" then color = "blue";
if predvalue = "Versicolor" then color = "red";
if predvalue = "Virginica" then color = "green";
run;

ods html style = harvest;
proc g3d data = iris_visual;
scatter PetalLength * PetalWidth = SepalLength /
color = color shape = shape;
run;quit;
ods html close;

****************END OF ALL CODING***************************************;
 
Link of r2sas_tmp.sas