import lzf
# da4 doesn't have libgit2-dev to install pygit2 yet
# import pygit2
from tokyocabinet import hash as tch
import clickhouse_driver as clickhouse
from datetime import datetime, timedelta, tzinfo
import difflib
from functools import wraps
import hashlib
import os
import time
import warnings
import fnvhash
__version__ = '1.3.3'
__author__ = "Marat (@cmu.edu)"
__license__ = "GPL v3"
PATHS = {
# data_type: (path, prefix_bit_length)
    # a prefix length of n means the data are split into 2**n files,
    # e.g. keys are in 0..31 for a 5-bit prefix.
    # The most critical: raw data for the initial storage, used in sweeps; 100TB, da4+ backup
'commit_sequential_idx': ('/da4_data/All.blobs/commit_{key}.idx', 7),
'commit_sequential_bin': ('/da4_data/All.blobs/commit_{key}.bin', 7),
'tree_sequential_idx': ('/da4_data/All.blobs/tree_{key}.idx', 7),
'tree_sequential_bin': ('/da4_data/All.blobs/tree_{key}.bin', 7),
'tag_data': ('/da4_data/All.blobs/tag_{key}.bin', 7),
'commit_data': ('/da4_data/All.blobs/commit_{key}.bin', 7),
'tree_data': ('/da4_data/All.blobs/tree_{key}.bin', 7),
'blob_data': ('/da4_data/All.blobs/blob_{key}.bin', 7),
# critical - random access to trees and commits on da4 - need to do offsets for the da3
'commit_random': ('/fast/All.sha1c/commit_{key}.tch', 7),
'tree_random': ('/fast/All.sha1c/tree_{key}.tch', 7),
'blob_offset': ('/fast/All.sha1o/sha1.blob_{key}.tch', 7),
'commit_offset': ('/fast/All.sha1o/sha1.commit_{key}.tch', 7),
'tree_offset': ('/fast/All.sha1o/sha1.tree_{key}.tch', 7),
# the rest of x_data is currently unused:
# 'commit_data': ('/data/All.blobs/commit_{key}.bin', # 7)
# 'tree_data': ('/data/All.blobs/tree_{key}.bin', 7)
# 'tag_data': ('/data/All.blobs/tag_{key}.bin', 7)
# relations - good to have but not critical
# move to current version R as they get updated
'commit_projects': ('/da0_data/basemaps/c2pFull{ver}.{key}.tch', 5),
'commit_children': ('/da0_data/basemaps/c2ccFull{ver}.{key}.tch', 5),
'commit_time_author': ('/da0_data/basemaps/c2taFull{ver}.{key}.tch', 5),
'commit_root': ('/da0_data/basemaps/c2rFull{ver}.{key}.tch', 5),
'commit_parent': ('/da0_data/basemaps/c2pcFull{ver}.{key}.tch', 5),
'author_commits': ('/da0_data/basemaps/a2cFull{ver}.{key}.tch', 5),
'author_projects': ('/da0_data/basemaps/a2pFull{ver}.{key}.tch', 5),
'author_files': ('/da0_data/basemaps/a2fFull{ver}.{key}.tch', 5),
'project_authors': ('/da0_data/basemaps/p2aFull{ver}.{key}.tch', 5),
'commit_head': ('/da0_data/basemaps/c2hFull{ver}.{key}.tch', 5),
'commit_blobs': ('/da0_data/basemaps/c2bFull{ver}.{key}.tch', 5),
'commit_files': ('/da0_data/basemaps/c2fFull{ver}.{key}.tch', 5),
'project_commits': ('/da0_data/basemaps/p2cFull{ver}.{key}.tch', 5),
'blob_commits': ('/da0_data/basemaps/b2cFull{ver}.{key}.tch', 5),
'blob_authors': ('/da0_data/basemaps/b2aFull{ver}.{key}.tch', 5),
'file_authors': ('/da0_data/basemaps/f2aFull{ver}.{key}.tch', 5),
'file_commits': ('/da0_data/basemaps/f2cFull{ver}.{key}.tch', 5),
'file_blobs': ('/da0_data/basemaps/f2bFull{ver}.{key}.tch', 5),
'blob_files': ('/da0_data/basemaps/b2fFull{ver}.{key}.tch', 5),
    'author_trpath': ('/da0_data/basemaps/a2trp{ver}.tch', 5),
# another way to get commit parents, currently unused
# 'commit_parents': ('/da0_data/basemaps/c2pcK.{key}.tch', 7)
# SHA1 cache, currently only on da4, da5 668G
'blob_index_line': ('/fast/All.sha1/sha1.blob_{key}.tch', 7),
'tree_index_line': ('/fast/All.sha1/sha1.tree_{key}.tch', 7),
'commit_index_line': ('/fast/All.sha1/sha1.commit_{key}.tch', 7),
'tag_index_line': ('/fast/All.sha1/sha1.tag_{key}.tch', 7)
}
def read_env_var():
global PATHS
all_blobs = [
'commit_sequential_idx', 'commit_sequential_bin', 'tree_sequential_idx',
'tree_sequential_bin', 'tag_data', 'commit_data', 'tree_data', 'blob_data'
]
all_sha1c = [
'commit_random', 'tree_random'
]
all_sha1o = [
'blob_offset', 'commit_offset', 'tree_offset'
]
basemaps = [
'commit_projects', 'commit_children', 'commit_time_author', 'commit_root',
'commit_parent', 'author_commits', 'author_projects', 'project_authors',
'commit_head', 'commit_blobs', 'commit_files', 'project_commits', 'blob_commits',
'blob_authors', 'file_commits', 'file_blobs', 'blob_files', 'author_trpath',
'author_files', 'file_authors'
]
all_sha1 = [
'blob_index_line', 'tree_index_line', 'commit_index_line', 'tag_index_line'
]
    # This maps an environment variable name to a group of key names in the
    # PATHS global. For example, the 'OSCAR_BASEMAPS' variable sets the
    # directory for all entries whose PATHS key appears in the basemaps list,
    # unless overridden by a more specific variable.
general_name_map = {
'OSCAR_ALL_BLOBS': all_blobs,
'OSCAR_ALL_SHA1C': all_sha1c,
'OSCAR_ALL_SHA1O': all_sha1o,
'OSCAR_BASEMAPS': basemaps,
'OSCAR_ALL_SHA1': all_sha1
}
    # This maps an environment variable name to a single key in the PATHS
    # global: each PATHS key gets 'OSCAR_' prepended and is upper-cased.
# For example: OSCAR_COMMIT_DATA environment variable corresponds to PATHS['commit_data']
specific_names = {'_'.join(['OSCAR', name.upper()]): name for name in PATHS.keys()}
ver_names = {'_'.join(['OSCAR', name.upper(), 'VER']): name for name in basemaps}
for v in os.environ.keys():
if not os.environ[v]:
continue
# general directory config
if v in general_name_map.keys():
for name in general_name_map[v]:
f = os.path.basename(PATHS[name][0])
PATHS[name] = (os.path.join(os.environ[v], f), PATHS[name][1])
        # specific directory config overrides general
        elif v in specific_names:
            name = specific_names[v]
            f = os.path.basename(PATHS[name][0])
            PATHS[name] = (os.path.join(os.environ[v], f), PATHS[name][1])
# specific version config
elif v in ver_names.keys():
PATHS[ver_names[v]] = (
PATHS[ver_names[v]][0].format(ver=os.environ[v], key='{key}'),
PATHS[ver_names[v]][1]
)
# general version config
elif v == "OSCAR_BASEMAPS_VER":
for name in basemaps:
if '{ver}' in PATHS[name][0]:
PATHS[name] = (PATHS[name][0].format(ver=os.environ[v], key='{key}'), PATHS[name][1])
# if version not set, default to version R
for key in PATHS.keys():
if '{ver}' in PATHS[key][0]:
PATHS[key] = (PATHS[key][0].format(ver='R', key='{key}'), PATHS[key][1])
read_env_var()
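# Usage sketch (illustrative): relocating data files via environment
# variables before importing this module. The directories below are
# hypothetical; only the variable names come from read_env_var() above.
#
#     export OSCAR_ALL_BLOBS=/mnt/wok/All.blobs   # a whole group of dtypes
#     export OSCAR_COMMIT_DATA=/ssd/All.blobs     # a single dtype, overrides the group
#     export OSCAR_BASEMAPS_VER=S                 # basemap version (default 'R')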
class ObjectNotFound(KeyError):
pass
def unber(s):
# type: (str) -> list
r""" Perl BER unpacking
Format definition: from http://perldoc.perl.org/functions/pack.html
(see "w" template description)
BER is a way to pack several variable-length ints into one
binary string. Here we do the reverse
:param s: a binary string with packed values
:return: a list of unpacked values
>>> unber('\x00\x83M')
[0, 461]
>>> unber('\x83M\x96\x14')
[461, 2836]
>>> unber('\x99a\x89\x12')
[3297, 1170]
"""
res = []
acc = 0
for char in s:
b = ord(char)
acc = (acc << 7) + (b & 0x7f)
if not b & 0x80:
res.append(acc)
acc = 0
return res
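# For reference, the packing direction (Perl's `pack "w"`): each byte carries
# 7 bits of the value, with the high bit set on every byte except the last.
# A minimal sketch of the inverse of unber(), for illustration only:
#
#     def ber(*values):
#         chunks = []
#         for v in values:
#             parts = [chr(v & 0x7f)]
#             v >>= 7
#             while v:
#                 parts.append(chr(0x80 | (v & 0x7f)))
#                 v >>= 7
#             chunks.append(''.join(reversed(parts)))
#         return ''.join(chunks)
#
#     ber(0, 461) == '\x00\x83M'  # matches the first unber() doctest above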
def lzf_length(raw_data):
# type: (str) -> (int, int)
r""" Get length of uncompressed data from a header of Compress::LZF
output. Check Compress::LZF sources for the definition of this bit magic
(namely, LZF.xs, decompress_sv)
:param raw_data: data compressed with Perl Compress::LZF
:return: tuple of (header_size, uncompressed_content_length) in bytes
>>> lzf_length('\xc4\x9b')
(2, 283)
>>> lzf_length('\xc3\xa4')
(2, 228)
>>> lzf_length('\xc3\x8a')
(2, 202)
>>> lzf_length('\xca\x87')
(2, 647)
>>> lzf_length('\xe1\xaf\xa9')
(3, 7145)
>>> lzf_length('\xe0\xa7\x9c')
(3, 2524)
"""
    if not raw_data:
        raise ValueError("LZF compressed data is missing its header")
lower = ord(raw_data[0])
csize = len(raw_data)
start = 1
mask = 0x80
while mask and csize > start and (lower & mask):
mask >>= 1 + (mask == 0x80)
start += 1
if not mask or csize < start:
raise ValueError("LZF compressed data header is corrupted")
usize = lower & (mask - 1)
for i in range(1, start):
usize = (usize << 6) + (ord(raw_data[i]) & 0x3f)
if not usize:
raise ValueError("LZF compressed data header is corrupted")
return start, usize
def decomp(raw_data):
# type: (str) -> str
""" lzf wrapper to handle perl tweaks in Compress::LZF
This function extracts uncompressed size header
and then does usual lzf decompression.
Please check Compress::LZF sources for the definition of this bit magic
:param raw_data: data compressed with Perl Compress::LZF
:return: string of unpacked data
"""
if not raw_data:
return ""
elif raw_data[0] == '\x00':
return raw_data[1:]
start, usize = lzf_length(raw_data)
return lzf.decompress(raw_data[start:], usize)
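# Quick sanity check (illustrative): Compress::LZF stores incompressible
# payloads verbatim behind a single '\x00' marker byte, which is the
# passthrough branch above; compressed payloads instead start with the
# variable-length size header parsed by lzf_length():
#
#     decomp('\x00hello') == 'hello'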
def cached_property(func):
""" Classic memoize with @property on top"""
@wraps(func)
def wrapper(self):
key = "_" + func.__name__
if not hasattr(self, key):
setattr(self, key, func(self))
return getattr(self, key)
return property(wrapper)
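# A minimal sketch of how cached_property behaves (illustrative):
#
#     class Foo(object):
#         @cached_property
#         def answer(self):
#             print("computing...")  # executed only once
#             return 42
#
#     foo = Foo()
#     foo.answer  # prints "computing...", returns 42
#     foo.answer  # returns 42 straight from foo._answer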
def slice20(raw_data):
""" Slice raw_data into 20-byte chunks and hex encode each of them
"""
if raw_data is None:
return ()
return tuple(raw_data[i:i + 20].encode('hex')
for i in range(0, len(raw_data), 20))
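# Example (illustrative): a concatenation of two 20-byte binary shas becomes
# a tuple of two 40-char hex strings:
#
#     slice20('\xab' * 20 + '\xcd' * 20) == ('ab' * 20, 'cd' * 20)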
class CommitTimezone(tzinfo):
# a lightweight version of pytz._FixedOffset
def __init__(self, hours, minutes):
self.offset = timedelta(hours=hours, minutes=minutes)
def utcoffset(self, dt):
return self.offset
def tzname(self, dt):
return 'fixed'
def dst(self, dt):
# daylight saving time - no info
return timedelta(0)
    def __repr__(self):
        # days is negative for west-of-UTC offsets, so rebuild the signed total
        total = self.offset.days * 24 * 60 + self.offset.seconds // 60
        sign = '-' if total < 0 else ''
        h, m = divmod(abs(total), 60)
        return "<Timezone: %s%02d:%02d>" % (sign, h, m)
DAY_Z = datetime.fromtimestamp(0, CommitTimezone(0, 0))
def parse_commit_date(timestamp):
""" Parse date string of authored_at/commited_at
git log time is in the original timezone
gitpython - same as git log (also, it has the correct timezone)
unix timestamps (used internally by commit objects) are in UTC
datetime.fromtimestamp without a timezone will convert it to host tz
github api is in UTC (this is what trailing 'Z' means)
    :param timestamp: Commit.authored_at or Commit.committed_at,
e.g. '1337145807 +1100'
:type timestamp: str
:return: UTC datetime
:rtype: datetime.datetime or None
>>> parse_commit_date('1337145807 +1100')
datetime.datetime(2012, 5, 16, 16, 23, 27, tzinfo=<Timezone: 11:00>)
>>> parse_commit_date('3337145807 +1100') is None
True
"""
ts, tz = timestamp.split()
sign = -1 if tz.startswith('-') else 1
try:
ts = int(ts)
        hours, minutes = sign * int(tz[-4:-2]), sign * int(tz[-2:])
dt = datetime.fromtimestamp(ts, CommitTimezone(hours, minutes))
except ValueError:
# i.e. if timestamp or timezone is invalid
return None
# timestamp is in the future
if ts > time.time():
return None
return dt
# Pool of open TokyoCabinet databases to save a few milliseconds on opening
_TCH_POOL = {}
def _get_tch(path):
if not path.endswith('.tch'):
path += '.tch'
if path not in _TCH_POOL:
_TCH_POOL[path] = tch.Hash()
_TCH_POOL[path].open(path, tch.HDBOREADER | tch.HDBONOLCK)
# _TCH_POOL[path].setmutex()
return _TCH_POOL[path]
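# Handles stay open for the lifetime of the process. A sketch of the pooling
# behaviour (illustrative):
#
#     db1 = _get_tch('/fast/All.sha1c/commit_5.tch')
#     db2 = _get_tch('/fast/All.sha1c/commit_5')  # '.tch' is appended
#     db1 is db2  # True - the same pooled handle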
def read_tch(path, key, silent=False):
    """ Read a value from a Tokyo Cabinet file by the specified key
    The main purpose of this method is to cache open .tch handles
    in _TCH_POOL to speed up reads
    """
    try:
        return _get_tch(path)[key]
    except KeyError:
        if silent:
            return None
        raise ObjectNotFound(path + " " + key)
    except Exception:
        # e.g. the .tch file itself is missing or unreadable
        return None
def tch_keys(path, key_prefix=''):
return _get_tch(path).fwmkeys(key_prefix)
def resolve_path(dtype, object_key, use_fnv=False):
# type: (str, str, bool) -> str
""" Get path to a file using data type and object key (for sharding) """
path, prefix_length = PATHS[dtype]
p = fnvhash.fnv1a_32(object_key) if use_fnv else ord(object_key[0])
prefix = p & (2**prefix_length - 1)
return path.format(key=prefix)
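# Sharding sketch (illustrative, assuming the default PATHS): git objects are
# keyed by the first byte of their binary sha, masked to the prefix length,
# while non-git objects hash the whole key with FNV-1a first:
#
#     bin_sha = '05cf84081b63cda822ee407e688269b494a642de'.decode('hex')
#     resolve_path('commit_random', bin_sha)
#     # -> '/fast/All.sha1c/commit_5.tch'   (0x05 & 0x7f == 5)
#     resolve_path('project_commits', 'user2589_minicms', use_fnv=True)
#     # -> '/da0_data/basemaps/p2cFullR.<fnv1a_32(key) & 0x1f>.tch'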
class _Base(object):
type = None
key = None
# fnv keys are used for non-git objects, such as files, projects and authors
use_fnv_keys = True
_keys_registry_dtype = None
def __init__(self, key):
"""
:param key: unique identifier for an object of this type
"""
self.key = key
def __repr__(self):
return "<%s: %s>" % ((self.type or 'OscarBase').capitalize(), self.key)
def __hash__(self):
return hash(self.key)
def __eq__(self, other):
"""
>>> sha = 'f2a7fcdc51450ab03cb364415f14e634fa69b62c'
>>> Commit(sha) == Commit(sha)
True
>>> Commit(sha) == Blob(sha)
False
"""
return isinstance(other, type(self)) \
and self.type == other.type \
and self.key == other.key
def __ne__(self, other):
return not self == other
def __str__(self):
return self.key
def resolve_path(self, dtype):
return resolve_path(dtype, self.key, self.use_fnv_keys)
def read_tch(self, dtype, silent=True):
""" Resolve the path and read .tch"""
return read_tch(self.resolve_path(dtype), self.key, silent)
    @classmethod
def all(cls):
""" Iterate all objects of the given type
This might be useful to get a list of all projects, or a list of
all file names.
        Returns:
            a generator of objects of the class this method was called on
        """
        if not cls._keys_registry_dtype:
            raise NotImplementedError
base_path, prefix_length = PATHS[cls._keys_registry_dtype]
for file_prefix in range(2 ** prefix_length):
tch_path = base_path.format(key=file_prefix)
for key in tch_keys(tch_path):
yield cls(key)
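# Usage sketch (illustrative): enumerating every known project or file name.
# This walks all 2**prefix_length shards, so it is slow and requires the
# corresponding .tch files to be present on this host:
#
#     for project in Project.all():
#         process(project)  # `process` is a placeholder
#     for f in File.all():
#         process(f)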
class GitObject(_Base):
use_fnv_keys = False
@classmethod
def all(cls):
""" Iterate ALL objects of this type (all projects, all times) """
base_idx_path, prefix_length = PATHS[cls.type + '_sequential_idx']
base_bin_path, prefix_length = PATHS[cls.type + '_sequential_bin']
for key in range(2**prefix_length):
idx_path = base_idx_path.format(key=key)
bin_path = base_bin_path.format(key=key)
            datafile = open(bin_path, 'rb')
            for line in open(idx_path):
                chunks = line.strip().split(";")
                if len(chunks) > 4:
                    # blob index lines have an extra field:
                    # the length of the uncompressed content
                    offset, comp_length, full_length, sha = chunks[1:5]
                else:
                    offset, comp_length, sha = chunks[1:4]
obj = cls(sha)
obj._data = decomp(datafile.read(int(comp_length)))
yield obj
datafile.close()
def __init__(self, sha):
"""
:param sha: either a 40 char hex or a 20 bytes binary SHA1 hash
>>> sha = '05cf84081b63cda822ee407e688269b494a642de'
>>> GitObject(sha.decode('hex')).sha == sha
True
>>> GitObject(sha).bin_sha == sha.decode('hex')
True
"""
if len(sha) == 40:
self.sha = sha
self.bin_sha = sha.decode("hex")
elif len(sha) == 20:
self.sha = sha.encode("hex")
self.bin_sha = sha
else:
raise ValueError("Invalid SHA1 hash: %s" % sha)
self.key = self.sha
        super(GitObject, self).__init__(self.sha)
def resolve_path(self, dtype):
# overriding to use bin_sha instead of the key (which is sha)
return resolve_path(dtype, self.bin_sha, self.use_fnv_keys)
def read_tch(self, dtype, silent=True):
""" Resolve the path and read .tch"""
return read_tch(self.resolve_path(dtype), self.bin_sha, silent)
@cached_property
def data(self):
if self.type not in ('commit', 'tree'):
raise NotImplementedError
# default implementation will only work for commits and trees
return decomp(self.read_tch(self.type + '_random', silent=False))
@classmethod
def string_sha(cls, data):
"""Manually compute blob sha from its content passed as `data`.
The main use case for this method is to identify source of a file.
Blob SHA is computed from a string:
"blob <file content length as str><null byte><file content>"
# https://gist.github.com/masak/2415865
Commit SHAs are computed in a similar way
"commit <commit length as str><null byte><commit content>"
note that commit content includes committed/authored date
Args:
data (str): content of the GitObject to get hash for
        Returns:
            str: 40-char hex SHA1 hash
"""
sha1 = hashlib.sha1()
sha1.update("%s %d\x00" % (cls.type, len(data)))
sha1.update(data)
return sha1.hexdigest()
    @classmethod
    def file_sha(cls, path):
        """ Compute the git object sha of a file without reading it
        into memory all at once """
        buffsize = 1024 ** 2
        size = os.stat(path).st_size
        sha1 = hashlib.sha1()
        sha1.update("%s %d\x00" % (cls.type, size))
        with open(path, 'rb') as fh:
            while True:
                data = fh.read(buffsize)
                if not data:
                    break
                sha1.update(data)
        return sha1.hexdigest()
def __str__(self):
"""
>>> print(Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c'))
tree d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d
parent 66acf0a046a02b48e0b32052a17f1e240c2d7356
author Pavel Puchkin <neoascetic@gmail.com> 1375321509 +1100
committer Pavel Puchkin <neoascetic@gmail.com> 1375321597 +1100
<BLANKLINE>
License changed :P
<BLANKLINE>
"""
return self.data
class Blob(GitObject):
type = 'blob'
def __len__(self):
_, length = self.position
return length
@classmethod
def string_sha(cls, data):
"""
>>> Blob.string_sha('Hello world!')
'6769dd60bdf536a83c9353272157893043e9f7d0'
"""
# return pygit2.hash(data)
return super(Blob, cls).string_sha(data)
@classmethod
def file_sha(cls, path):
"""Manually compute blob sha from a file content.
Similar to string_sha
>>> Blob.file_sha('LICENSE')
'94a9ed024d3859793618152ea559a168bbcbb5e2'
"""
# return pygit2.hashfile(path)
return super(Blob, cls).file_sha(path)
@cached_property
def position(self):
""" Get offset and length of the blob data in the storage """
        try:
            offset, length = unber(self.read_tch('blob_offset'))
        except (ValueError, TypeError):  # empty or missing value
            raise ObjectNotFound('Blob data not found (bad sha?)')
return offset, length
@cached_property
def data(self):
""" Content of the blob """
offset, length = self.position
# no caching here to stay thread-safe
with open(self.resolve_path('blob_data'), 'rb') as fh:
fh.seek(offset)
return decomp(fh.read(length))
@cached_property
def commit_shas(self):
""" SHAs of Commits in which this blob have been
introduced or modified.
**NOTE: commits removing this blob are not included**
"""
return slice20(self.read_tch('blob_commits'))
@property
def commits(self):
""" Commits where this blob has been added or changed
**NOTE: commits removing this blob are not included**
"""
        return (Commit(sha) for sha in self.commit_shas)
class Tree(GitObject):
""" A representation of git tree object, basically - a directory.
Trees are iterable. Each element of the iteration is a 3-tuple:
`(mode, filename, sha)`
    - `mode` is an ASCII decimal **string** similar to file mode
        in Unix systems. Subtrees always have mode "40000"
    - `filename` is a string filename, not including directories
    - `sha` is a 40-char hex string representing the file content Blob SHA
.. Note:: iteration is not recursive.
For a recursive walk, use Tree.traverse() or Tree.files
Both files and blobs can be checked for membership,
either by their id (filename or SHA) or a corresponding object:
>>> tree = Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d")
>>> '.gitignore' in tree
True
>>> File('.keep') in tree
False
>>> '83d22195edc1473673f1bf35307aea6edf3c37e3' in tree
True
>>> Blob('83d22195edc1473673f1bf35307aea6edf3c37e3') in tree
True
`len(tree)` returns the number of files under the tree, including files in
subtrees but not the subtrees themselves:
>>> len(Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d"))
16
"""
type = 'tree'
def __iter__(self):
""" Unpack binary tree structures, yielding 3-tuples of
(mode (ASCII decimal), filename, sha (40 bytes hex))
        Format description: https://stackoverflow.com/questions/14790681/
            mode (ASCII encoded decimal)
            SPACE (\x20)
            filename
            NULL (\x00)
            20-byte binary hash
>>> len(list(Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d")))
6
>>> all(len(line) == 3
... for line in Tree("954829887af5d9071aa92c427133ca2cdd0813cc"))
True
"""
data = self.data
i = 0
while i < len(data):
# mode
start = i
while i < len(data) and data[i] != " ":
i += 1
mode = data[start:i]
i += 1
# file name
start = i
while i < len(data) and data[i] != "\x00":
i += 1
fname = data[start:i]
# sha
start = i + 1
i += 21
yield mode, fname, data[start:i].encode('hex')
def __len__(self):
return len(self.files)
def __contains__(self, item):
if isinstance(item, File):
return item.key in self.files
elif isinstance(item, Blob):
return item.sha in self.blob_shas
elif not isinstance(item, str):
return False
return item in self.blob_shas or item in self.files
    def traverse(self):
""" Recursively traverse the tree
This will generate 3-tuples of the same format as direct tree
iteration, but will recursively include subtrees content.
:return: generator of (mode, filename, blob/tree sha)
>>> c = Commit("1e971a073f40d74a1e72e07c682e1cba0bae159b")
>>> len(list(c.tree.traverse()))
8
>>> c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc')
>>> len(list(c.tree.traverse()))
36
"""
for mode, fname, sha in self:
yield mode, fname, sha
# trees are always 40000:
# https://stackoverflow.com/questions/1071241
if mode == "40000":
for mode2, fname2, sha2 in Tree(sha).traverse():
yield mode2, fname + '/' + fname2, sha2
@property
def full(self):
""" Formatted tree content, including recursive files and subtrees
It is intended for debug purposes only.
:return: multiline string, where each line contains mode, name and sha,
with subtrees expanded
"""
files = sorted(self.traverse(), key=lambda x: x[1])
return "\n".join(" ".join(line) for line in files)
def __str__(self):
"""
>>> print(Tree("954829887af5d9071aa92c427133ca2cdd0813cc"))
100644 __init__.py ff1f7925b77129b31938e76b5661f0a2c4500556
100644 admin.py d05d461b48a8a5b5a9d1ea62b3815e089f3eb79b
100644 models.py d1d952ee766d616eae5bfbd040c684007a424364
40000 templates 7ff5e4c9bd3ce6ab500b754831d231022b58f689
40000 templatetags e5e994b0be2c9ce6af6f753275e7d8c29ccf75ce
100644 urls.py e9cb0c23a7f6683911305efff91dcabadb938794
100644 utils.py 2cfbd298f18a75d1f0f51c2f6a1f2fcdf41a9559
100644 views.py 973a78a1fe9e69d4d3b25c92b3889f7e91142439
"""
return "\n".join(" ".join(line) for line in self)
@cached_property
def files(self):
""" A dict of all files and their content/blob sha under this tree.
It includes recursive files (i.e. files in subdirectories).
It does NOT include subdirectories themselves.
"""
return {fname: sha
for mode, fname, sha in self.traverse() if mode != "40000"}
@property
def blob_shas(self):
"""A tuple of all file content shas, including files in subdirectories
"""
return tuple(self.files.values())
@property
def blobs(self):
""" A generator of Blob objects with file content.
It does include files in subdirectories.
>>> tuple(Tree('d20520ef8c1537a42628b72d481b8174c0a1de84').blobs
... ) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
(<Blob: 2bdf5d686c6cd488b706be5c99c3bb1e166cf2f6>, ...,
<Blob: c006bef767d08b41633b380058a171b7786b71ab>)
"""
return (Blob(sha) for sha in self.blob_shas)
class Commit(GitObject):
""" A git commit object.
Commits have some special properties.
Most of object properties provided by this project are lazy, i.e. they are
computed when you access them for the first time.
The following `Commit` properties will be instantiated all at once on the
first access to *any* of them.
- :data:`tree`: root `Tree` of the commit
- :data:`parent_shas`: tuple of parent commit sha hashes
- :data:`message`: str, first line of the commit message
- :data:`full_message`: str, full commit message
- :data:`author`: str, Name <email>
- :data:`authored_at`: str, unix_epoch+timezone
- :data:`committer`: str, Name <email>
- :data:`committed_at`: str, unix_epoch+timezone
"""
type = 'commit'
def __getattr__(self, attr):
""" Mimic special properties:
tree: root Tree of the commit
parent_shas: tuple of parent commit sha hashes
message: str, first line of the commit message
full_message: str, full commit message
author: str, Name <email>
authored_at: timezone-aware datetime or None (if invalid)
committer: str, Name <email>
committed_at: timezone-aware datetime or None (if invalid)
signature: str or None, PGP signature
Commit: https://github.com/user2589/minicms/commit/e38126db
>>> c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc')
>>> c.author.startswith('Marat')
True
>>> c.authored_at
datetime.datetime(2012, 5, 19, 1, 14, 8, tzinfo=<Timezone: 11:00>)
>>> c.tree.sha
'6845f55f47ddfdbe4628a83fdaba35fa4ae3c894'
>>> len(c.parent_shas)
1
>>> c.parent_shas[0]
'ab124ab4baa42cd9f554b7bb038e19d4e3647957'
>>> c.committed_at
datetime.datetime(2012, 5, 19, 1, 14, 8, tzinfo=<Timezone: 11:00>)
"""
attrs = ('tree', 'parent_shas', 'message', 'full_message', 'author',
'committer', 'authored_at', 'committed_at', 'signature')
if attr not in attrs:
raise AttributeError
for a in attrs:
setattr(self, a, None)
self.header, self.full_message = self.data.split("\n\n", 1)
self.message = self.full_message.split("\n", 1)[0]
parent_shas = []
signature = None
reading_signature = False
for line in self.header.split("\n"):
if reading_signature:
# examples:
# 1cc6f4418dcc09f64dcbb0410fec76ceaa5034ab
# cbbc685c45bdff4da5ea0984f1dd3a73486b4556
signature += line
if line.strip() == "-----END PGP SIGNATURE-----":
self.signature = signature
reading_signature = False
continue
if line.startswith(" "): # mergetag object, not supported (yet?)
# example: c1313c68c7f784efaf700fbfb771065840fc260a
continue
line = line.strip()
if not line: # sometimes there is an empty line after gpgsig
continue
try:
key, value = line.split(" ", 1)
except ValueError:
raise ValueError("Unexpected header in commit " + self.sha)
if key == "tree":
self.tree = Tree(value)
elif key == "parent": # multiple parents possible
parent_shas.append(value)
elif key == "author":
# author name can have arbitrary number of spaces while
# timestamp is guaranteed to have one, so rsplit
chunks = value.rsplit(" ", 2)
self.author = chunks[0]
self.authored_at = parse_commit_date(" ".join(chunks[1:]))
elif key == "committer":
# same logic as author
chunks = value.rsplit(" ", 2)
self.committer = chunks[0]
self.committed_at = parse_commit_date(" ".join(chunks[1:]))
elif key == 'gpgsig':
signature = value
reading_signature = True
self.parent_shas = tuple(parent_shas)
return getattr(self, attr)
def __sub__(self, parent, threshold=0.5):
""" Compare two Commits.
Args:
parent (Commit): another commit to compare to.
Expected order is `diff = child_commit - parent_commit`
Returns:
Generator[Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]]:
4-tuples: `(old_path, new_path, old_sha, new_sha)`
Examples:
- a new file 'setup.py' was created:
`(None, 'setup.py', None, 'file_sha')`
- an existing 'setup.py' was deleted:
`('setup.py', None, 'old_file_sha', None)`
- setup.py.old was renamed to setup.py, content unchanged:
`('setup.py.old', 'setup.py', 'file_sha', 'file_sha')`
- setup.py was edited:
`('setup.py', 'setup.py', 'old_file_sha', 'new_file_sha')`
- setup.py.old was edited and renamed to setup.py:
`('setup.py.old', 'setup.py', 'old_file_sha', 'new_file_sha')`
Detecting the last one is computationally expensive. You can adjust this
behaviour by passing the `threshold` parameter, which is 0.5 by default.
It means that if roughly 50% of the file content is the same,
it is considered a match. `threshold=1` means that only exact
matches are considered, effectively disabling this comparison.
If threshold is set to 0, any pair of deleted and added file will be
considered renamed and edited; this last case doesn't make much sense so
don't set it too low.
"""
if parent.sha not in self.parent_shas:
warnings.warn("Comparing non-adjacent commits might be "
"computationally expensive. Proceed with caution.")
# filename: (blob sha before, blob sha after)
new_files = self.tree.files
new_paths = set(new_files.keys())
old_files = parent.tree.files
old_paths = set(old_files.keys())
        # changed files: present in both trees but with different blob shas
        for fname in new_paths.intersection(old_paths):
            if new_files[fname] != old_files[fname]:
                # i.e. the blob sha changed, so the file was modified
                yield (fname, fname, old_files[fname], new_files[fname])
added_paths = new_paths - old_paths
deleted_paths = old_paths - new_paths
if threshold >= 1: # i.e. only exact matches are considered
for fname in added_paths:
yield (None, fname, None, new_files[fname])
for fname in deleted_paths:
yield (fname, None, old_files[fname], None)
return
        # search for matches among deleted/added files (i.e. renames)
        sm = difflib.SequenceMatcher()
        added_blobs = {f: Blob(new_files[f]) for f in added_paths}
        deleted_blobs = {f: Blob(old_files[f]) for f in deleted_paths}
        # for each added blob, try to find a match in deleted blobs;
        # if there is a match, signal a rename and remove it from deleted,
        # if there is no match, signal a new file.
        # Deleted blobs left unmatched are indeed deleted
        for added_fname, added_blob in added_blobs.items():
            sm.set_seq1(added_blob.data)
            matched = False
            for deleted_fname, deleted_blob in deleted_blobs.items():
                sm.set_seq2(deleted_blob.data)
                # cheap checks first (both are upper bounds on ratio())
                if sm.real_quick_ratio() > threshold \
                        and sm.quick_ratio() > threshold \
                        and sm.ratio() > threshold:
                    yield (deleted_fname, added_fname,
                           deleted_blob.sha, added_blob.sha)
                    del deleted_blobs[deleted_fname]
                    matched = True
                    break
            if not matched:  # this is a new file
                yield (None, added_fname, None, added_blob.sha)
        for deleted_fname, deleted_blob in deleted_blobs.items():
            yield (deleted_fname, None, deleted_blob.sha, None)
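    # Usage sketch (illustrative): list files changed by a commit relative to
    # its first parent, reusing the shas from the doctests above:
    #
    #     c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc')
    #     parent = Commit(c.parent_shas[0])
    #     for old_path, new_path, old_sha, new_sha in c - parent:
    #         print(old_path, '->', new_path)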
@property
def parents(self):
""" A generator of parent commits.
If you only need hashes (and not `Commit` objects),
        use `.parent_shas` instead
Commit: https://github.com/user2589/minicms/commit/e38126db
>>> c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc')
>>> tuple(c.parents)
(<Commit: ab124ab4baa42cd9f554b7bb038e19d4e3647957>,)
"""
return (Commit(sha) for sha in self.parent_shas)
@cached_property
def project_names(self):
# type: () -> tuple
""" URIs of projects including this commit.
This property can be used to find all forks of a project
by its first commit.
Commit: https://github.com/user2589/minicms/commit/f2a7fcdc
>>> c = Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c')
>>> isinstance(c.project_names, tuple)
True
>>> len(c.project_names) > 0
True
>>> 'user2589_minicms' in c.project_names
True
"""
data = decomp(self.read_tch('commit_projects'))
return tuple(project_name
for project_name in (data and data.split(";")) or []
if project_name and project_name != 'EMPTY')
@property
def projects(self):
""" A generator of `Project` s, in which this commit is included.
"""
return (Project(uri) for uri in self.project_names)
@cached_property
def child_shas(self):
""" Children commit binary sha hashes.
Basically, this is a reverse parent_shas
Commit: https://github.com/user2589/minicms/commit/1e971a07
>>> Commit('1e971a073f40d74a1e72e07c682e1cba0bae159b').child_shas
('9bd02434b834979bb69d0b752a403228f2e385e8',)
"""
return slice20(self.read_tch('commit_children'))
@property
def children(self):
""" A generator of children `Commit` objects
Commit: https://github.com/user2589/minicms/commit/1e971a07
>>> tuple(Commit('1e971a073f40d74a1e72e07c682e1cba0bae159b').children)
(<Commit: 9bd02434b834979bb69d0b752a403228f2e385e8>,)
"""
return (Commit(sha) for sha in self.child_shas)
@cached_property
def blob_shas(self):
""" SHA hashes of all blobs in the commit
>>> Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blob_shas
... # doctest: +NORMALIZE_WHITESPACE
('b2f49ffef1c8d7ce83a004b34035f917713e2766',
'c92011c5ccc32a9248bd929a6e56f846ac5b8072',
'bf3c2d2df2ef710f995b590ac3e2c851b592c871')
"""
return self.tree.blob_shas
@cached_property
def changed_file_names(self):
data = decomp(self.read_tch('commit_files'))
return tuple((data and data.split(";")) or [])
def files_changed(self):
return (File(filename) for filename in self.changed_file_names)
@property
def blob_shas_rel(self):
"""
This relation is known to miss every first file in all trees.
Consider using Commit.tree.blobs as a slower but more accurate
alternative.
When this relation passes the test, please replace blob_sha with it
It should be faster but as of now it is not accurate
"""
warnings.warn(
"This relation is known to miss every first file in all trees. "
"Consider using Commit.tree.blobs as a slower but more accurate "
"alternative", DeprecationWarning)
return slice20(self.read_tch('commit_blobs'))
@property
def blobs(self):
""" A generator of `Blob` objects included in this commit
>>> tuple(Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blobs)
... # doctest: +NORMALIZE_WHITESPACE
(<Blob: b2f49ffef1c8d7ce83a004b34035f917713e2766>,
<Blob: c92011c5ccc32a9248bd929a6e56f846ac5b8072>,
<Blob: bf3c2d2df2ef710f995b590ac3e2c851b592c871>)
"""
        return (Blob(sha) for sha in self.blob_shas)
@cached_property
def files(self):
data = decomp(self.read_tch('commit_files'))
return tuple(file_name
for file_name in (data and data.split(";")) or [] if file_name and file_name != 'EMPTY')
class Commit_info(GitObject):
    """ Access to auxiliary commit relations:
    authored time/author and the head commit """
    @cached_property
    def time_author(self):
        data = self.read_tch('commit_time_author')
        return tuple((data and data.split(";")) or [])
    @cached_property
    def head(self):
        return slice20(self.read_tch('commit_head'))
class Tag(GitObject):
""" Tag doesn't have any functionality associated.
You can't really do anything useful with it yet
"""
type = 'tag'
class Project(_Base):
"""
Projects are initialized with a URI:
- Github: `{user}_{repo}`, e.g. `user2589_minicms`
- Gitlab: `gl_{user}_{repo}`
- Bitbucket: `bb_{user}_{repo}`
- Bioconductor: `bioconductor.org_{user}_{repo}`
- kde: `kde.org_{user}_{repo}`
- drupal: `drupal.org_{user}_{repo}`
    - Googlesource: `android.googlesource.com_{repo}_{user}`
- Linux kernel: `git.kernel.org_{user}_{repo}`
- PostgreSQL: `git.postgresql.org_{user}_{repo}`
- GNU Savannah: `git.savannah.gnu.org_{user}_{repo}`
- ZX2C4: `git.zx2c4.com_{user}_{repo}`
- GNOME: `gitlab.gnome.org_{user}_{repo}`
- repo.or.cz: `repo.or.cz_{user}_{repo}`
- Salsa: `salsa.debian.org_{user}_{repo}`
- SourceForge: `sourceforge.net_{user}_{repo}`
Projects are iterable:
>>> for commit in Project('user2589_minicms'): # doctest: +SKIP
... print(commit.sha)
Commits can be checked for membership in a project, either by their SHA
hash or by a Commit object itself:
Commit: https://github.com/user2589/minicms/commit/e38126db
>>> sha = 'e38126dbca6572912013621d2aa9e6f7c50f36bc'
>>> sha in Project('user2589_minicms')
True
>>> Commit(sha) in Project('user2589_minicms')
True
"""
type = 'project'
_keys_registry_dtype = 'project_commits'
def __init__(self, uri):
self.uri = uri
super(Project, self).__init__(uri)
def __iter__(self):
""" Generator of all commits in the project.
Order of commits is not guaranteed
>>> commits = tuple(Project('user2589_minicms'))
>>> len(commits) > 60
True
>>> isinstance(commits[0], Commit)
True
"""
for sha in self.commit_shas:
try:
c = Commit(sha)
author = c.author
except ObjectNotFound:
continue
if author != 'GitHub Merge Button <merge-button@github.com>':
yield c
def __contains__(self, item):
if isinstance(item, Commit):
key = item.key
elif isinstance(item, str):
if len(item) == 20:
key = item.encode('hex')
elif len(item) == 40:
key = item
else:
return False
else:
return False
return key in self.commit_shas
@cached_property
def commit_shas(self):
""" SHA1 of all commits in the project
>>> Project('user2589_django-currencies').commit_shas
... # doctest: +NORMALIZE_WHITESPACE
('2dbcd43f077f2b5511cc107d63a0b9539a6aa2a7',
'7572fc070c44f85e2a540f9a5a05a95d1dd2662d')
"""
tch_path = self.resolve_path('project_commits')
return slice20(read_tch(tch_path, self.key, silent=True))
@property
def commits(self):
""" A generator of all Commit objects in the project.
It has the same effect as iterating a `Project` instance itself,
with some additional validation of commit dates.
>>> tuple(Project('user2589_django-currencies').commits)
... # doctest: +NORMALIZE_WHITESPACE
(<Commit: 2dbcd43f077f2b5511cc107d63a0b9539a6aa2a7>,
<Commit: 7572fc070c44f85e2a540f9a5a05a95d1dd2662d>)
"""
commits = tuple(c for c in self)
tails = tuple(c for c in commits
if not c.parent_shas and c.authored_at is not None)
if tails:
min_date = min(c.authored_at for c in tails)
else: # i.e. if all tails have invalid date
min_date = DAY_Z
for c in commits:
if c.authored_at and c.authored_at < min_date:
c.authored_at = None
yield c
@cached_property
def head(self):
""" Get the HEAD commit of the repository
>>> Project('user2589_minicms').head
<Commit: f2a7fcdc51450ab03cb364415f14e634fa69b62c>
>>> Project('RoseTHERESA_SimpleCMS').head
<Commit: a47afa002ccfd3e23920f323b172f78c5c970250>
"""
# Sometimes (very rarely) commit dates are wrong, so the latest commit
# is not actually the head. The magic below is to account for this
commits = {c.sha: c for c in self.commits}
parents = set().union(*(c.parent_shas for c in commits.values()))
heads = set(commits.keys()) - parents
# it is possible that there is more than one head.
# E.g. it happens when HEAD is moved manually (git reset)
# and continued with a separate chain of commits.
# in this case, let's just use the latest one
# actually, storing refs would make it much simpler
return sorted((commits[sha] for sha in heads),
key=lambda c: c.authored_at or DAY_Z)[-1]
@cached_property
def tail(self):
""" Get the first commit SHA by following first parents
>>> Project('user2589_minicms').tail
'1e971a073f40d74a1e72e07c682e1cba0bae159b'
"""
commits = {c.sha: c for c in self.commits}
pts = set(c.parent_shas[0] for c in commits.values() if c.parent_shas)
for sha, c in commits.items():
if sha in pts and not c.parent_shas:
return sha
@property
def commits_fp(self):
""" Get a commit chain by following only the first parent, to mimic
https://git-scm.com/docs/git-log#git-log---first-parent .
Thus, you only get a small subset of the full commit tree:
>>> p = Project('user2589_minicms')
>>> set(c.sha for c in p.commits_fp).issubset(p.commit_shas)
True
In scenarios where branches are not important, it can save a lot
of computing.
Note: commits will come in order from the latest to the earliest.
"""
# Simplified version of self.head():
# - slightly less precise,
# - 20% faster
#
# out of 500 randomly sampled projects, 493 had the same head.
# In the remaining 7:
# 2 had the same commit chain length,
# 3 had one more commit
# 1 had two more commits
# 1 had three more commits
# Execution time:
# simplified version (argmax): ~153 seconds
# self.head(): ~190 seconds
# at this point we know all commits are in the dataset
# (validated in __iter___)
commits = {c.sha: c for c in self.commits}
commit = max(commits.values(), key=lambda c: c.authored_at or DAY_Z)
while commit:
try: # here there is no guarantee commit is in the dataset
first_parent = commit.parent_shas and commit.parent_shas[0]
except ObjectNotFound:
break
yield commit
if not first_parent:
break
commit = commits.get(first_parent, Commit(first_parent))
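    # Usage sketch (illustrative): the first-parent chain length is a common
    # proxy for the size of the mainline history:
    #
    #     p = Project('user2589_minicms')
    #     chain = list(p.commits_fp)        # latest commit first
    #     len(chain) <= len(p.commit_shas)  # True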
def toURL(self):
'''
Get the URL for a given project URI
>>> Project('CS340-19_lectures').toURL()
        'https://github.com/CS340-19/lectures'
'''
p_name = self.uri
found = False
        toUrlMap = {
            "bb": "bitbucket.org", "gl": "gitlab.com",
            "android.googlesource.com": "android.googlesource.com",
            "bioconductor.org": "bioconductor.org",
            "drupal.org": "git.drupal.org",
            "git.eclipse.org": "git.eclipse.org",
            "git.kernel.org": "git.kernel.org",
            "git.postgresql.org": "git.postgresql.org",
            "git.savannah.gnu.org": "git.savannah.gnu.org",
            "git.zx2c4.com": "git.zx2c4.com",
            "gitlab.gnome.org": "gitlab.gnome.org",
            "kde.org": "anongit.kde.org",
            "repo.or.cz": "repo.or.cz",
            "salsa.debian.org": "salsa.debian.org",
            "sourceforge.net": "git.code.sf.net/p"}
for URL in toUrlMap.keys():
URL_ = URL + "_"
if p_name.startswith(URL_) and (p_name.count('_') >= 2 or URL == "sourceforge.net"):
replacement = toUrlMap[URL] + "/"
p_name = p_name.replace(URL_, replacement)
found = True
break
if not found:
p_name = "github.com/" + p_name
p_name = p_name.replace('_', '/', 1)
return "https://" + p_name
@cached_property
def author_names(self):
data = decomp(self.read_tch('project_authors'))
return tuple(author_name
for author_name in (data and data.split(";")) or [] if author_name and author_name != 'EMPTY')
class File(_Base):
"""
Files are initialized with a path, starting from a commit root tree:
>>> File('.gitignore') # doctest: +SKIP
>>> File('docs/Index.rst') # doctest: +SKIP
"""
type = 'file'
_keys_registry_dtype = 'file_commits'
def __init__(self, path):
self.path = path
super(File, self).__init__(path)
@cached_property
def authors(self):
        data = decomp(self.read_tch('file_authors'))
        return tuple((data and data.split(";")) or [])
@cached_property
def commit_shas(self):
""" SHA1 of all commits changing this file
**NOTE: this relation considers only diff with the first parent,
which substantially limits its application**
>>> commits = File('minicms/templatetags/minicms_tags.py').commit_shas
>>> len(commits) > 0
True
>>> isinstance(commits, tuple)
True
>>> isinstance(commits[0], str)
True
>>> len(commits[0]) == 40
True
"""
file_path = self.key
#if not file_path.endswith("\n"):
# file_path += "\n"
tch_path = resolve_path('file_commits', file_path, self.use_fnv_keys)
return slice20(read_tch(tch_path, file_path, silent=True))
@property
def commits(self):
""" All commits changing the file
.. note: this relation considers only diff with the first parent,
which substantially limits its application
>>> cs = tuple(File('minicms/templatetags/minicms_tags.py').commits)
>>> len(cs) > 0
True
>>> isinstance(cs[0], Commit)
True
"""
for sha in self.commit_shas:
c = Commit(sha)
try:
author = c.author
except ObjectNotFound:
continue
if author != 'GitHub Merge Button <merge-button@github.com>':
yield c
def __str__(self):
return super(File, self).__str__().rstrip("\n\r")
class Author(_Base):
"""
Authors are initialized with a combination of name and email, as they
appear in git configuration.
>>> Author('John Doe <john.doe@aol.com>') # doctest: +SKIP
At this point we don't have a relation to map all aliases of the same
author, so keep in mind this object represents an alias, not a person.
"""
type = 'author'
_keys_registry_dtype = 'author_commits'
def __init__(self, full_email):
self.full_email = full_email
super(Author, self).__init__(full_email)
@cached_property
def commit_shas(self):
""" SHA1 of all commits authored by the Author
>>> commits = Author('user2589 <valiev.m@gmail.com>').commit_shas
>>> len(commits) > 50
True
>>> isinstance(commits, tuple)
True
>>> isinstance(commits[0], str)
True
>>> len(commits[0]) == 40
True
"""
return slice20(self.read_tch('author_commits', silent=True))
@property
def commits(self):
""" A generator of all Commit objects authored by the Author
>>> commits = tuple(Author('user2589 <valiev.m@gmail.com>').commits)
>>> len(commits) > 50
True
>>> isinstance(commits[0], Commit)
True
"""
return (Commit(sha) for sha in self.commit_shas)
@cached_property
def files(self):
        data = decomp(self.read_tch('author_files'))
        return tuple((data and data.split(";")) or [])
@cached_property
def project_names(self):
""" URIs of projects where author has committed to
A generator of all Commit objects authored by the Author
"""
data = decomp(self.read_tch('author_projects'))
return tuple(project_name
for project_name in (data and data.split(";")) or [] if project_name and project_name != 'EMPTY')
@cached_property
def torvald(self):
        data = decomp(self.read_tch('author_trpath'))
        return tuple((data and data.split(";")) or [])
class Clickhouse_DB(object):
''' Clickhouse_DB class represents an instance of the clickhouse client
It is initialized with a table name and a host name for the database
'''
def __init__(self, tb_name, db_host):
self.tb_name = tb_name
self.db_host = db_host
self.client_settings = {'strings_as_bytes':True, 'max_block_size':100000}
self.client = clickhouse.Client(host=self.db_host, settings=self.client_settings)
def query(self, query_str):
return self.client.execute(query_str)
def query_iter(self, query_str):
row_iter = self.client.execute_iter(query_str)
for row in row_iter:
yield row
def query_select(self, s_col, s_from, s_start, s_end):
# normal query
s_where = self.__where_condition(s_start, s_end)
query_str = 'select {} from {} where {}'.format(s_col, s_from, s_where)
return self.client.execute(query_str)
def query_select_iter(self, s_col, s_from, s_start, s_end):
# iterative query
s_where = self.__where_condition(s_start, s_end)
query_str = 'select {} from {} where {}'.format(s_col, s_from, s_where)
row_iter = self.client.execute_iter(query_str)
for row in row_iter:
yield row
def __where_condition(self, start, end):
        # check whether start/end are timestamps or dates, then build the where clause
dt = 'time'
if not self.__check_time(start, end):
dt = 'date'
start = 'toDate(\'{}\')'.format(start)
end = 'toDate(\'{}\')'.format(end) if end else None
if end is None:
return '{}={}'.format(dt, start)
else:
return '{}>={} AND {}<={}'.format(dt, start, dt, end)
    def __check_time(self, start, end):
        # start and end must be of the same type, either int or string
        if start is None:
            raise ValueError('start time cannot be None')
        elif not isinstance(start, (int, basestring)):
            raise ValueError('start time must be either int or string')
        elif end is not None and not isinstance(end, (int, basestring)):
            raise ValueError('end time must be either int or string')
        elif end is not None and type(start) is not type(end):
            raise ValueError('start and end must be of the same type')
        return isinstance(start, int)
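# A sketch of the where-clauses built above (illustrative): int arguments are
# treated as unix timestamps and compared against the `time` column, string
# arguments as dates compared against a `date` column (assumed to exist in
# the queried table):
#
#     # __where_condition(1568656268, None)
#     #     -> "time=1568656268"
#     # __where_condition('2019-09-16', '2019-09-17')
#     #     -> "date>=toDate('2019-09-16') AND date<=toDate('2019-09-17')"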
class Time_commit_info(Clickhouse_DB):
    ''' Time_commit_info class is initialized with a table name and a database host name;
    the default table for commits is commits_all, and the default host is localhost.
    No connection is established until the first query is made.
The 'commits_all' table description is the following:
|__name___|______type_______|
| sha1 | FixedString(20) |
| time | Int32 |
| tree | FixedString(20) |
| author | String |
| parent | String |
| comment | String |
| content | String |
'''
columns = ['sha1', 'time', 'tree', 'author', 'parent', 'comment', 'content']
def __init__(self, tb_name='commits_all', db_host='localhost'):
super(Time_commit_info, self).__init__(tb_name, db_host)
def commit_counts(self, start, end=None):
        ''' return the number of commits at the given time, or within the given range
>>> t = Time_commit_info()
>>> t.commit_counts(1568656268)
8
'''
rows = self.query_select('count(*)', self.tb_name, start, end)
return rows[0][0]
def commits_iter(self, start, end=None):
        ''' return a generator of Commit instances within the given time/date range
>>> t = Time_commit_info()
>>> commits = t.commits_iter(1568656268)
>>> c = commits.next()
>>> type(c)
<class 'oscar.Commit'>
>>> c.parent_shas
('9c4cc4f6f8040ed98388c7dedeb683469f7210f5',)
'''
row_iter = self.query_select_iter('lower(hex(sha1))', self.tb_name, start, end)
for row in row_iter:
yield Commit(row[0])
def commits_shas(self, start, end=None):
        ''' return a list of commit shas within the given time/date range
>>> t = Time_commit_info()
>>> shas = t.commits_shas(1568656268)
>>> type(shas)
<type 'list'>
'''
rows = self.query_select('lower(hex(sha1))', self.tb_name, start, end)
return [row[0] for row in rows]
def commits_shas_iter(self, start, end=None):
        ''' return a generator of commit shas within the given time/date range
>>> t = Time_commit_info()
>>> for sha1 in t.commits_shas_iter(1568656268):
... print(sha1)
'''
row_iter = self.query_select_iter('lower(hex(sha1))', self.tb_name, start, end)
for row in row_iter:
yield row[0]
class Time_project_info(Clickhouse_DB):
    ''' Time_project_info class is initialized with a table name and a database host name.
    The default table name is b2cPtaPkgR_all, and the default host is localhost.
    This class contains methods to query for project data.
    The 'b2cPtaPkgR_all' table description is the following:
|___name___|______type_______|
| blob | FixedString(20) |
| commit | FixedString(20) |
| project | String |
| time | UInt32 |
| author | String |
| language | String |
| deps | String |
'''
columns = ['blob', 'commit', 'project', 'time', 'author', 'language', 'deps']
def __init__(self, tb_name='b2cPtaPkgR_all', db_host='localhost'):
super(Time_project_info, self).__init__(tb_name, db_host)
def get_values_iter(self, cols, start, end):
''' return a generator for table rows for a given time interval
>>> from oscar import Time_project_info as Proj
>>> p = Proj()
>>> rows = p.get_values_iter(['time','project'], 1568571909, 1568571910)
>>> for row in rows:
... print(row)
...
(1568571909, 'mrtrevanderson_CECS_424')
(1568571909, 'gitlab.com_surajpatel_tic_toc_toe')
(1568571909, 'gitlab.com_surajpatel_tic_toc_toe')
...
'''
cols = self.__wrap_cols(cols)
rows_iter = self.query_select_iter(', '.join(cols), self.tb_name, start, end)
for row in rows_iter:
yield row
def project_timeline(self, cols, project):
''' return a generator for all rows given a project name (ordered by time)
>>> rows = p.project_timeline(['time','project'], 'mrtrevanderson_CECS_424')
>>> for row in rows:
... print(row)
...
(1568571909, 'mrtrevanderson_CECS_424')
(1568571909, 'mrtrevanderson_CECS_424')
(1568571909, 'mrtrevanderson_CECS_424')
...
'''
cols = self.__wrap_cols(cols)
query_str = 'SELECT {} FROM {} WHERE project=\'{}\' ORDER BY time'\
.format(', '.join(cols), self.tb_name, project)
rows_iter = self.query_iter(query_str)
for row in rows_iter:
yield row
def author_timeline(self, cols, author):
''' return a generator for all rows given an author (ordered by time)
>>> rows = p.author_timeline(['time', 'project'], 'Andrew Gacek <andrew.gacek@gmail.com>')
>>> for row in rows:
... print(row)
...
(49, 'smaccm_camera_demo')
(677, 'smaccm_vm_hack')
(1180017188, 'teyjus_teyjus')
...
'''
cols = self.__wrap_cols(cols)
query_str = 'SELECT {} FROM {} WHERE author=\'{}\' ORDER BY time'\
.format(', '.join(cols), self.tb_name, author)
rows_iter = self.query_iter(query_str)
for row in rows_iter:
yield row
    def __wrap_cols(self, cols):
        ''' wrap binary sha columns ('commit', 'blob') in lower(hex(...))
        so they come back as hex strings
        '''
for i in range(len(cols)):
if cols[i] == 'commit' or cols[i] == 'blob':
cols[i] = 'lower(hex({}))'.format(cols[i])
return cols