Thanks to Jared Hobbs' sas7bdat package, Python can read SAS data sets quickly and precisely. It would be great to have a few extension functions that connect this package to SQLite and Pandas.
The benefits of transferring SAS libraries to SQLite:
- Size reduction:
SAS's sas7bdat format is verbose. So far I have loaded 40 GB of SAS data into SQLite with an 85% reduction in disk usage.
- Save the cost of SAS/ACCESS:
SAS/ACCESS costs around $8,000 a year for a server, while SQLite is accessible from most common software.
The benefits of transferring SAS data sets to Pandas:
- Pandas' powerful Excel interface:
Write very large Excel files quickly, as long as memory can hold the data.
- Validation of statistics:
Pandas works well with statsmodels and scikit-learn, which makes it easy to validate SAS's outputs (see the sketch below).
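For example, once a SAS data set lands in a DataFrame, Pandas can dump it to Excel and re-fit a model to compare against SAS's output. A minimal sketch, using a toy DataFrame in place of a real SAS export (the column names, file name, and formula here are only illustrative, not part of the package):

import pandas as pd
import statsmodels.formula.api as smf

# Toy stand-in for a DataFrame exported from a SAS data set
df = pd.DataFrame({'height': [65, 70, 72, 68],
                   'weight': [120, 150, 180, 140]})

# Write to Excel (needs an Excel engine such as openpyxl or xlsxwriter)
df.to_excel('class_check.xlsx', index=False)

# Fit a simple regression to compare with the corresponding PROC REG run in SAS
model = smf.ols('weight ~ height', data=df).fit()
print(model.summary())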
import sqlite3
import glob
import sas7bdat
class SASexport(sas7bdat.SAS7BDAT):
    def head(self, n=5):
        # Print the first n rows (the first row returned by readData() is the header)
        for i, row in enumerate(self.readData()):
            print row
            if i == n:
                break
        if n > self.header.rowcount:
            print 'n exceeds the total number of rows, so all rows are shown'

    def meta(self):
        # Print the metadata stored in the sas7bdat header
        print self.header
    def to_sqlite(self, sqlitedb):
        if sqlitedb is None or not isinstance(sqlitedb, str):
            sqlitedb = ':memory:'
            print 'not a valid output name; using an in-memory database instead'
        # Map each SAS column to a SQLite type based on the header metadata
        cols = self.header.cols
        strs = [''] * len(cols)
        for i, n in enumerate(cols):
            if n.attr.type == "numeric":
                strs[i] = n.name + ' real'
            else:
                strs[i] = n.name + ' varchar({})'.format(n.attr.length)
        table = self.header.dataset.title()
        cmd1 = "CREATE TABLE {} ({})".format(table, ', '.join(strs))
        cmd2 = 'INSERT INTO {} VALUES ( {} )'.format(table,
            ','.join(['?'] * len(cols)))
        conn = sqlite3.connect(sqlitedb)
        c = conn.cursor()
        for i, line in enumerate(self.readData()):
            if i == 0:
                # The first row holds the column names: (re)create the table
                c.execute('DROP TABLE IF EXISTS {}'.format(table))
                c.execute(cmd1)
            else:
                c.execute(cmd2, line)
        conn.commit()
        c.close()
    def to_pandas(self):
        try:
            import pandas
        except ImportError:
            raise ImportError('No pandas module is available and this '
                              'method cannot be used')
        # The first row from readData() is the column names; the rest is data
        rst = []
        for i, x in enumerate(self.readData()):
            if i == 0:
                colnames = x
            else:
                rst.append(x)
        df = pandas.DataFrame(data=rst, columns=colnames)
        return df
class SASbatchexport:
    def __init__(self, directory):
        if directory is None or not isinstance(directory, str):
            raise ValueError('SAS library path has to be specified')
        self.directory = directory

    def to_sqlitedb(self, dest=None):
        """Export all SAS data sets in the library to a SQLite database"""
        if dest is None or not isinstance(dest, str):
            print 'The output SQLite db will be named SASOUTPUT.db'
            dest = 'SASOUTPUT.db'
        s = self.directory + '/*.sas7bdat'
        for sasfile in glob.glob(s):
            _data = SASexport(sasfile)
            _data.to_sqlite(dest)
            print 'SAS dataset {} has been successfully exported'.format(
                _data.header.dataset)
if __name__ == '__main__':
    # Test the export feature
    data = SASexport('d:/google/onedrive/class.sas7bdat')
    data.head(30)
    data.meta()
    data.to_sqlite('d:/tmp/test3.db')
    data = SASexport('d:/google/onedrive/prdsal2.sas7bdat')
    data.head()
    df = data.to_pandas()
    print df
    # Test the batch export feature
    lib = SASbatchexport('d:/google/onedrive/')
    lib.to_sqlitedb('d:/tmp/test.db')
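As a quick round-trip check, the table written by to_sqlite() can be read straight back into Pandas. A minimal sketch, assuming the class.sas7bdat export above created a table named Class in d:/tmp/test3.db (the table name comes from header.dataset.title(), so it may differ for other data sets):

import sqlite3
import pandas

# Read the exported table back and compare row counts / summary statistics
conn = sqlite3.connect('d:/tmp/test3.db')
check = pandas.read_sql_query('SELECT * FROM Class', conn)
print(check.describe())
conn.close()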