#!/usr/bin/env python
"""Stand-alone testing code for inserting HTS read data into pygr and demonstrating
the slow speed of reading the data.

"""

import csv
import itertools
import os.path
import re
import sys
from collections import defaultdict

from pygr import cnestedlist
from pygr import seqdb
import pygr.Data

class attrdict(dict):
    """Dictionary whose entries are also reachable as attributes.

    The instance's attribute namespace is the dictionary itself, so
    ``d.key`` and ``d['key']`` always refer to the same value.

    """

    def __init__(self, *args, **kwargs):
        super(attrdict, self).__init__(*args, **kwargs)
        # Alias the attribute namespace to the mapping itself.
        self.__dict__ = self

class Annot(attrdict):
    """Annotation record stored as an attribute dictionary.

    Adds nothing to ``attrdict``; it only gives annotation records a type
    name of their own.

    """

def dictionarySubset(d, keep_keys):
    """Return a new dictionary with only the entries of d whose key is in keep_keys.

    d is any mapping that can be iterated over its keys and indexed by them
    (it does not need an ``iteritems`` method).
    keep_keys is any container supporting the ``in`` test.

    Keys named in keep_keys but missing from d are ignored; d is not modified.

    >>> sorted(dictionarySubset({'a': 1, 'b': 2, 'c': 3}, ['a', 'c', 'z']).items())
    [('a', 1), ('c', 3)]

    """

    return dict((key, d[key]) for key in d if key in keep_keys)

# Orientation values that are treated as the negative (reverse) strand
_reverse_repr = [-1, "-1", "-", "C"]

def makePygrSequence(chromosome, start, stop, orientation, genome):
    """Slice genome[chromosome][start:stop], negated for reverse-strand orientations.

    The slice is negated with unary minus when orientation is one of the
    values in _reverse_repr; any other orientation returns the slice as is.

    >>> import pygr.Data
    >>> g = pygr.Data.getResource("Bio.Seq.Genome.HUMAN.hg18")
    >>> makePygrSequence("chr1", 10, 20, 1, g)
    chr1[10:20]

    """

    forward_slice = genome[chromosome][start:stop]
    return -forward_slice if orientation in _reverse_repr else forward_slice

# preprocessed data parsers
class BaseParser(object):
    """Base class for record parsers: iterating a parser iterates ``parse()``."""

    def __iter__(self):
        # Iteration simply delegates to whatever parse() returns.
        return self.parse()

    def parse(self):
        """Return an iterator over parsed records; subclasses must override."""
        raise NotImplementedError()

class ReadIndexer(object):
    """Group the records produced by a parser by the value of one field.

    """

    def __init__(self, parser):
        # Any iterable of subscriptable records works as the parser.
        self.parser = parser

    def index(self, field):
        """Return a defaultdict mapping each value of field to its records.

        Records keep the order in which the parser produced them.

        """

        grouped = defaultdict(list)
        for record in self.parser:
            grouped[record[field]].append(record)
        return grouped

# One mismatch descriptor: "<offset>:<base>><base>", e.g. "21:A>T".
# Raw string so that "\d" is not treated as a string escape.
_bowtie_mismatches_regex = re.compile(r"(\d+):([ACGT])>([ACGT])")

def parseBowtieMismatches(mismatches):
    """Parse the mismatch field in bowtie records.

    mismatches is an iterable of descriptor strings such as '21:A>T'.

    Returns a list of (offset, base_before_arrow, base_after_arrow) tuples in
    input order, with the offset converted to an int.  Descriptors that do
    not match the expected pattern (for example the empty string produced by
    splitting an empty field) are skipped.

    NOTE(review): the Bowtie manual appears to describe the descriptor as
    offset:reference-base>read-base -- confirm before relying on which base
    is which.

    >>> mismatches = ['21:A>T', '1:C>G']
    >>> parseBowtieMismatches(mismatches)
    [(21, 'A', 'T'), (1, 'C', 'G')]

    """

    parsed_mismatches = []

    for mismatch in mismatches:
        match = _bowtie_mismatches_regex.match(mismatch)
        if match is None:
            # Not a mismatch descriptor; ignore it.
            continue
        offset, before_base, after_base = match.groups()
        parsed_mismatches.append((int(offset), before_base, after_base))

    return parsed_mismatches

class BowtieParser(BaseParser):
    """Parse a delimited bowtie alignment file into one dictionary per row.

    Each row is read with the column names in _header.  parse() then adds
    'stop', 'nummismatches' and 'orientation', replaces 'mismatches' with the
    parsed list and normalises 'strand' to 1 or -1.

    """

    # Columns converted to int while parsing
    _integer_features = ['start']
    # Column names, in file order
    _header = ['readname', 'strand', 'chromosome', 'start', 'readsequence',
               'mappingquality', 'reserved', 'mismatches']

    def __init__(self, input_filename, delimiter="\t"):
        """Open input_filename and set up the csv reader over it.

        Note that this raises the csv module's field size limit, which is a
        process-wide setting.

        """

        self.input_filename = input_filename
        csv.field_size_limit(1000000000)
        # Keep the handle so that it can be closed once the file is read.
        self._input_file = open(input_filename)
        self.csv_reader = csv.DictReader(self._input_file,
                                         fieldnames=self._header,
                                         delimiter=delimiter)

    def parse(self):
        """Yield one processed record dictionary per row of the file.

        Raises ValueError (carrying the field name) when an integer column
        cannot be converted.  A csv.Error stops the parse early: the number
        of the offending record (1-based) is written to stderr and the
        records read so far are all that is produced.

        The input file is closed once the rows are exhausted or a csv.Error
        ends the parse; calling parse() again afterwards yields nothing.

        """

        if self._input_file.closed:
            # Already consumed by an earlier parse.
            return

        records_read = 0
        try:
            for record in self.csv_reader:
                records_read += 1

                # process features that should be ints
                for intfield in self._integer_features:
                    try:
                        record[intfield] = int(record[intfield])
                    except ValueError:
                        raise ValueError("%s" % intfield)

                record['stop'] = record['start'] + len(record['readsequence'])

                # A row without the mismatch column is read as None by
                # DictReader; treat that like an empty field.
                mismatch_field = record['mismatches'] or ""
                parsed_mismatches = parseBowtieMismatches(mismatch_field.split(","))
                record['nummismatches'] = len(parsed_mismatches)
                record['mismatches'] = parsed_mismatches

                if record['strand'] in [0, "0", "+", "W"]:
                    direction = 1
                else:
                    direction = -1
                record['orientation'] = direction
                record['strand'] = direction

                yield record
        except csv.Error:
            # The record after the last one read successfully is the bad one.
            sys.stderr.write("Error on line number %s\n" % (records_read + 1))

        self._input_file.close()

class BaseInserter(object):
    """Basic insertion class for taking annotation-type data and inserting it into pygr.

    Subclasses (or mixins) supply read(), which fills self.annots, and
    buildSliceAttrDict(), which fills self.sliceAttrDict; insert() then runs
    the whole pipeline.

    """

    _name = "BaseInserter"

    def __init__(self, sequence_db, *args, **kwargs):
        """Sets up the sequence database (generally a genome) that the annotations are inserted to.

        Extra positional and keyword arguments are accepted and ignored.

        """

        self.sequence_db = sequence_db

    def read(self, *args, **kwargs):
        """Responsible for parsing the data to self.annots.

        Data must have a start, stop, orientation, and id. The ID should be a reference
        to what part of the sequence database the annotation is on, generally the chromosome.

        Recommended to write a mixin class that has a read method for defining new Insertion classes.

        Raises NotImplementedError unless overridden.

        """

        raise NotImplementedError("Must define your own read method")

    def annotationDB(self, unique_keys=None, maxCache=1e8, *args, **kwargs):
        """Builds an annotationDB for passing to the annotation mapper.

        unique_keys is the attribute to use as a unique identifier for the annotation.
        If it is not a key of self.sliceAttrDict (the default None is not),
        integer positions in self.annots are used instead.

        If the value of the unique_key for one annotation is UNIQUEKEY,
        it will show up as annotUNIQUEKEY[0:10], for example.

        maxCache and any extra arguments are passed on to seqdb.AnnotationDB.
        The results are stored on self.annots_to_build and self.annotation_db;
        nothing is returned.

        NOTE(review): annotations sharing the same unique_keys value collapse
        to a single entry in self.annots_to_build -- confirm callers only pass
        attributes that really are unique.

        """

        if unique_keys in self.sliceAttrDict:
            keys = [annot[unique_keys] for annot in self.annots]
        else:
            keys = range(len(self.annots))

        self.annots_to_build = dict(zip(keys, self.annots))
        self.annotation_db = seqdb.AnnotationDB(self.annots_to_build, self.sequence_db,
                                                sliceAttrDict=self.sliceAttrDict,
                                                maxCache=maxCache, *args, **kwargs)

    def annotationMap(self, location, info="", mode='w', *args, **kwargs):
        """Make a pairwise alignment of the features to the genome.

        location is the physical disk location for the files.
        info is a string describing the data; it becomes the map's __doc__.
        mode tells where to put it ('w' for disk, 'memory' for in main memory)

        Extra arguments are passed on to cnestedlist.NLMSA.  The built NLMSA
        is stored on self.annotations_map and returned.  annotationDB() must
        have been called first.

        """

        self.annotations_map = cnestedlist.NLMSA(os.path.abspath(location),
                                                 mode=mode,
                                                 pairwiseMode=True,
                                                 use_virtual_lpo=True,
                                                 *args, **kwargs)

        for annotation in self.annotation_db.values():
            self.annotations_map.addAnnotation(annotation)

        self.annotations_map.build()
        self.annotations_map.__doc__ = info

        return self.annotations_map

    def addtoPygr(self, resource_string):
        """Add self.annotations_map to pygr at the given resource_string and save.

        """

        pygr.Data.getResource.addResource(resource_string, self.annotations_map)
        pygr.Data.save()

    def insert(self, data, location, mode="w", resource_string=None, info="", *args, **kwargs):
        """Convenience method to make and insert NLMSA data into pygr.Data.

        This calls read, buildSliceAttrDict, annotationDB, annotationMap,
        and, when resource_string is given, addtoPygr.  Extra positional and
        keyword arguments are forwarded to read() only.

        Returns the NLMSA created by the annotationMap method which is inserted as a pygr resource.

        """

        self.read(data, *args, **kwargs)
        self.buildSliceAttrDict()
        self.annotationDB()
        nlmsa = self.annotationMap(location, mode=mode, info=info)
        if resource_string:
            self.addtoPygr(resource_string)
        return nlmsa

class DictionaryListInserter(BaseInserter):
    """Inserter whose input is an in-memory list of dictionaries.

    """

    _name = "DictionaryListInserter"

    def read(self, data_list):
        """Expects a list of dictionaries.

        Each dictionary is copied into an Annot.  'start' and 'stop' are
        converted to int, and 'orientation' is set to -1 when the record's
        'Strand' value is one of the reverse-strand representations in
        _reverse_repr, otherwise (including when 'Strand' is absent) to 1.

        self.fields is set to the keys of the first annotation, or to an
        empty list when data_list is empty.

        """

        # A real list: the annotations are indexed and measured later on.
        self.annots = [Annot(record) for record in data_list]

        for annot in self.annots:
            annot['start'] = int(annot['start'])
            annot['stop'] = int(annot['stop'])

            if annot.get('Strand') in _reverse_repr:
                annot['orientation'] = -1
            else:
                annot['orientation'] = 1

        if self.annots:
            self.fields = list(self.annots[0].keys())
        else:
            self.fields = []

    def buildSliceAttrDict(self, *args, **kwargs):
        """Build self.sliceAttrDict from self.fields.

        Every field maps to itself; 'id' maps to 'chromosome', and
        'orientation', 'start' and 'stop' are always present.  Arguments are
        accepted and ignored.

        """

        self.sliceAttrDict = dict((field, field) for field in self.fields)

        self.sliceAttrDict['id'] = 'chromosome'
        self.sliceAttrDict['orientation'] = 'orientation'
        self.sliceAttrDict['start'] = 'start'
        self.sliceAttrDict['stop'] = 'stop'

class BowtieUniqueReadInserter(DictionaryListInserter):
    """Inserts reads that mapped with bowtie to a unique location into pygr.

    One annotation is produced per distinct genomic location; it carries the
    number of uniquely mapped reads found there and their names.

    """

    def read(self, filename):
        """Parse the bowtie file at filename into self.annots.

        Reads whose name occurs more than once in the file are discarded.
        The remaining reads are grouped by the sequence slice returned by
        makePygrSequence, and each group becomes one Annot with 'start',
        'stop', 'orientation', 'chromosome', 'readnames' and 'count'.

        """

        self.fields = ['start', 'stop', 'orientation',
                       'chromosome', 'count', 'readnames']

        # index data by readname to find reads that mapped to multiple locations
        reads_by_name = ReadIndexer(BowtieParser(filename)).index("readname")

        # keep the reads that mapped to a single location
        unique_reads = (hits[0] for hits in reads_by_name.values()
                        if len(hits) == 1)

        # index the reads by reference sequence to count the number
        # of reads at each location
        reads_by_location = defaultdict(list)
        for read in unique_reads:
            seq = makePygrSequence(read['chromosome'], read['start'],
                                   read['stop'], read['orientation'],
                                   self.sequence_db)
            reads_by_location[seq].append(read)

        self.annots = []

        # just store the count of the number of reads at a particular genomic position
        # and a list of the readnames that go there
        for reads_here in reads_by_location.values():
            annot = dictionarySubset(reads_here[0],
                                     keep_keys=['start', 'stop',
                                                'orientation',
                                                'chromosome'])
            annot['readnames'] = [read['readname'] for read in reads_here]
            annot['count'] = len(reads_here)

            self.annots.append(Annot(annot))

    def buildAttrDict(self, *args, **kwargs):
        """Backward-compatible alias for buildSliceAttrDict.

        insert() calls buildSliceAttrDict, so this differently named method
        was never part of the insertion pipeline.  The inherited
        buildSliceAttrDict builds the same mapping for the fields set by
        read() ('orientation' is one of them), so this simply delegates.

        """

        return self.buildSliceAttrDict(*args, **kwargs)


def main():
    """Insert a bowtie read file into pygr the way that later makes loading slow.

    Builds the annotation map for "bowtie_mapped_reads.txt" against the yeast
    genome resource, writes it under /tmp and registers it with pygr.Data.

    Sample usage of the stored data afterwards, with the time it took:
    In [1]: import pygr.Data
    In [2]: from pygr_nlmsa_test import Annot # how can I avoid this line?
    In [3]: import time
    In [4]: t = time.time() ; reads = pygr.Data.getResource('Bio.Annotation.Foo.bowtie_mapped_reads'); print time.time() - t
    47.9402289391

    """

    reads_file = "bowtie_mapped_reads.txt"
    nlmsa_path = "/tmp/bowtie_mapped_reads"
    resource_name = "Bio.Annotation.Foo.bowtie_mapped_reads"

    yeast_genome = pygr.Data.getResource("Bio.Seq.Genome.YEAST.sacCer")

    # build the read annotations and register them as a pygr resource
    inserter = BowtieUniqueReadInserter(yeast_genome)
    inserter.insert(reads_file,
                    location=nlmsa_path,
                    mode="w",
                    resource_string=resource_name,
                    info="Testing bowtie insertion")

# Run the insertion demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
