#!/usr/bin/env python
#
# Allows SQL-like querying of a Comma Separated Value (CSV) file.
# 
# Sample CSV:
#
# NAME,AGE,BIRTH,COUNTRY,GPA
# Dave,26,1979-02-12,United States,3.75
#
# csvq.py -w 'int(AGE) > 18' foo.csv
# csvq.py -w 'date(BIRTH).year == 1979' foo.csv
# csvq.py -w 'COUNTRY in ("United States","Canada","Mexico")' foo.csv
# csvq.py -w 'round(float(GPA)) == 4.0' foo.csv
# csvq.py -s NAME,BIRTH -w 'date(BIRTH).month = 2' foo.csv
#
# (C)2006 David A. Riggs <david a riggs at gmail dot com>
# Licensed under the GNU GPLv2: http://www.gnu.org/licenses/gpl.html
#
# TODO handle non-streaming selects like 'distinct', 'count', 'max',
# or arbitrary Python transforms on the way out
# TODO flag to not print headers
#

import sys, csv

if sys.hexversion < 0x0203F0F0:
    # datetime module, CSV parser introduced in Python 2.3
    raise ImportError('Python 2.3 or higher required')

# imports for restricted eval environment
import time
import datetime as pydatetime

# Known date/datetime formats that we'll try if no format is specified
date_formats = ('%Y-%m-%d', '%Y/%m/%d')  # no, I will not try ambiguous date formats, eg. 
datetime_formats = ('%Y-%m-%dT%H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M')


### Internal helper functions

def _parse_date(s, fmt, default_fmts):
    '''Convert some string date/datetime value to a DateTime object, if no format specified, try to guess'''
    if not s: return None
    if fmt:
        return pydatetime.datetime(*(time.strptime(s, fmt)[0:6]))
    # brute-force try all known formats
    # TODO: be smarter about this with pattern matching
    s = s.strip()
    for fmt in default_fmts:
        try:
            return pydatetime.datetime(*(time.strptime(s, fmt)[0:6]))
        except ValueError:
            continue
    raise ValueError('Unable to parse date/datetime value: '+s)


### Externally-available helper functions

def date(s, fmt=None):
    '''Convert some string date value to a DateTime object, if no format specified, try to guess'''
    return _parse_date(s, fmt, date_formats)


def datetime(s, fmt=None):
    '''Convert some string date-time value to a DateTime object, if no format specified, try to guess'''
    return _parse_date(s, fmt, datetime_formats)


### Internal public API

class CsvQuery:
    '''actual CSV querying engine'''
    def __init__(self, delim=',', headers=None, select=None, where=None):
        self.delim = delim
        self.where = where
        self.headers = headers  # may be None!
        if select:
            self.select = select
        else:
            self.select = headers
        self.where_environment = globals() # TODO restrict
    
    def process(self, infile, outfile):
        '''perform the actual CSV querying process'''
        self._hack_headers(infile)
        print >> outfile, self.delim.join(self.select)
        reader = csv.DictReader(infile, self.headers, delimiter=self.delim)
        for row in reader:
            try:
                if self._where_clause(row):
                    print >> outfile, self._select_clause(row)
            except Exception, e:   # TODO can we be more specific?
                print >> sys.stderr, 'Row failed: %s: %s' % (e, str(row))
    
    def _where_clause(self, row):
        if not self.where: return True
        return bool(eval(self.where, self.where_environment, row))  # TODO hax0r-proof
    
    def _select_clause(self, row):
        cols = []
        for col in self.select:
            val = row[col]
            if not val: val = ''
            cols.append(val)
        return self.delim.join(cols)

    def _hack_headers(self, infile):
        if self.headers: return
        # if nobody specified headers, we need to figure them out for default select
        # wouldn't be necessary if DictReader would just tell us
        self.headers = infile.readline().strip().split(self.delim)
        if not self.select:
            self.select = self.headers
        

def csvq(infile, outfile, delim=',', headers=None, select=None, where=None):
    '''perform CSV query - this may be reused outside the executable script'''
    CsvQuery(delim, headers, select, where).process(infile, outfile)


### Internal executable script guts

def _main_optparse():
    '''setup Python OptionParser, return tuple of (opts,args)'''
    from optparse import OptionParser
    parser = OptionParser(description='Query a CSV (Comma Separated Value) file in SQL fashion.',
                          version='%prog 0.1', usage='%prog [OPTIONS] [FILE]')
    parser.add_option('-d', '--delim', dest='delim', default=',',
                       help='Use specified delimiter, \\t for TAB, COMMA is default')
    parser.add_option('-H', '--headers', dest='headers', default=None,
                       help='Specify column headers, COMMA separated, otherwise use first row')
    parser.add_option('-s', '--select', dest='select', default=None,
                       help='Select clause, COMMA separated column names, otherwise select all columns')
    parser.add_option('-w', '--where', dest='where', default=None,
                       help='Where clause (int,float,date,datetime,bool,<,>,==) or any Python statement')
    return parser.parse_args()

def _main():
    '''executable script main'''
    opts, args = _main_optparse()

    # try mapping these specials, otherwise use literal value
    delims = {'\\t':'\t',}
    if opts.delim in delims.keys():
        opts.delim = delims[opts.delim]
        
    if args:
        infile = open(args[0],'rb')
    else:
        infile = sys.stdin
    
    outfile = sys.stdout
    
    headers = None
    if opts.headers:
        headers = opts.headers.split(',')
    
    if opts.select:
        select = opts.select.split(',')
    else:
        select = headers
    
    try:
        csvq(infile, outfile, opts.delim, headers, select, opts.where)
    except KeyboardInterrupt, ki:
        print >> sys.stderr, ki


if __name__ == '__main__':
    _main()

