Sunday, August 10, 2014

Translate SAS's sas7bdat format to SQLite and Pandas

Thanks to Jared Hobbs’ sas7bdat package, Python can read SAS’s data sets quickly and precisely. It would be great to have a few extension functions that enhance this package with SQLite and Pandas support, as in the demo below.
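For example, the base package can already be used directly, as in this minimal sketch ('class.sas7bdat' is just a placeholder file name):

import sas7bdat

# A minimal sketch of the base package: the first row returned by
# readData() holds the column names, the rest are data rows.
# 'class.sas7bdat' is a placeholder file name.
f = sas7bdat.SAS7BDAT('class.sas7bdat')
for i, row in enumerate(f.readData()):
    print row
    if i == 5:
        break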
The benefits of transferring a SAS library to SQLite:
  1. Size reduction:
    SAS’s sas7bdat format is verbose. So far I have successfully loaded 40 GB of SAS data into SQLite with an 85% reduction in disk usage.
  2. Save the cost of SAS/ACCESS:
    SAS/ACCESS costs around $8,000 a year for a server, while SQLite is accessible from most common software.
The benefits of transferring a SAS data set to Pandas:
  1. Pandas’ powerful Excel interface:
    Write very large Excel files quickly, as long as memory can hold the data.
  2. Validation of statistics:
    Pandas works well with statsmodels and scikit-learn, which makes it easy to validate SAS’s outputs (see the sketch after this list).
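As a sketch of both points, with a toy DataFrame standing in for a converted SAS data set (in practice df would come from the to_pandas() method shown below; the column names and values here are purely illustrative):

import pandas
import statsmodels.formula.api as smf

# A hedged sketch with toy data; in practice df would come from the
# to_pandas() method shown below.
df = pandas.DataFrame({'HEIGHT': [62, 65, 70, 72],
                       'WEIGHT': [120, 140, 160, 180]})
df.to_excel('sas_export.xlsx', index=False)        # Pandas' Excel writer
model = smf.ols('WEIGHT ~ HEIGHT', data=df).fit()  # cross-check against, e.g., PROC REG
print model.summary()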
import sqlite3
import glob
import sas7bdat


class SASexport(sas7bdat.SAS7BDAT):
    def head(self, n=5):
        # Print the column names plus the first n data rows
        for i, row in enumerate(self.readData()):
            print row
            if i == n:
                break
        if n > self.header.rowcount:
            print 'n exceeds the total number of rows; all rows are shown'

    def meta(self):
        # Print the header metadata of the SAS data set
        print self.header

    def to_sqlite(self, sqlitedb):
        # Export the SAS data set to a table in a SQLite database
        if sqlitedb is None or not isinstance(sqlitedb, str):
            sqlitedb = ':memory:'
            print 'Not a valid output name; using an in-memory database instead'
        cols = self.header.cols
        strs = [''] * len(cols)
        for i, n in enumerate(cols):
            if n.attr.type == 'numeric':
                strs[i] = n.name + ' real'
            else:
                strs[i] = n.name + ' varchar({})'.format(n.attr.length)
        table = self.header.dataset.title()
        cmd1 = 'CREATE TABLE {} ({})'.format(table, ', '.join(strs))
        cmd2 = 'INSERT INTO {} VALUES ({})'.format(table,
            ','.join(['?'] * len(cols)))
        conn = sqlite3.connect(sqlitedb)
        c = conn.cursor()
        for i, line in enumerate(self.readData()):
            if i == 0:
                # The first row holds the column names: rebuild the table
                c.execute('DROP TABLE IF EXISTS {}'.format(table))
                c.execute(cmd1)
            else:
                c.execute(cmd2, line)
        conn.commit()
        c.close()

    def to_pandas(self):
        # Export the SAS data set to a pandas DataFrame
        try:
            import pandas
        except ImportError:
            raise ImportError('No pandas module is available and this '
                              'method cannot be used')
        rst = []
        for i, x in enumerate(self.readData()):
            if i == 0:
                colnames = x
            else:
                rst.append(x)
        df = pandas.DataFrame(data=rst, columns=colnames)
        return df


class SASbatchexport:
    def __init__(self, directory):
        if directory is None or not isinstance(directory, str):
            raise ValueError('SAS library path has to be specified')
        self.directory = directory

    def to_sqlitedb(self, dest=None):
        """Export all SAS data sets to a SQLite database"""
        if dest is None or not isinstance(dest, str):
            print 'The output SQLite db will be named SASOUTPUT.db'
            dest = 'SASOUTPUT.db'
        s = self.directory + '/*.sas7bdat'
        for sasfile in glob.glob(s):
            _data = SASexport(sasfile)
            _data.to_sqlite(dest)
            print 'SAS dataset {} has been successfully exported'.format(
                _data.header.dataset)


if __name__ == '__main__':
    # Test the export feature
    data = SASexport('d:/google/onedrive/class.sas7bdat')
    data.head(30)
    data.meta()
    data.to_sqlite('d:/tmp/test3.db')
    data = SASexport('d:/google/onedrive/prdsal2.sas7bdat')
    data.head()
    df = data.to_pandas()
    print df
    # Test the batch export feature
    lib = SASbatchexport('d:/google/onedrive/')
    lib.to_sqlitedb('d:/tmp/test.db')
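To spot-check the exported database afterwards, one possible sketch is to read a table back with pandas. The database path mirrors the test above; the table name 'Class' is only an assumption about how header.dataset.title() names the table for class.sas7bdat.

import sqlite3
import pandas

# A hedged sketch: read back a table written by to_sqlite() above.
# 'd:/tmp/test3.db' matches the test script; the table name 'Class' is an
# assumption about how header.dataset.title() names the table.
conn = sqlite3.connect('d:/tmp/test3.db')
df = pandas.read_sql('SELECT * FROM Class', conn)
print df.head()
conn.close()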