import lzf
# da4 doesn't have libgit2-dev to install pygit2 yet
# import pygit2
from tokyocabinet import hash as tch
import clickhouse_driver as clickhouse
from datetime import datetime, timedelta, tzinfo
import difflib
from functools import wraps
import hashlib
import os
import time
import warnings
import fnvhash
__version__ = '1.3.3'
__author__ = "Marat (@cmu.edu)"
__license__ = "GPL v3"
PATHS = {
# data_type: (path, prefix_bit_length)
    # a prefix length of n means the data are split into 2**n files,
    # e.g. keys are in 0..31 for a 5-bit prefix.
    # The most critical: raw data for the initial storage, used in sweeps; 100TB, da4+ backup
'commit_sequential_idx': ('/da4_data/All.blobs/commit_{key}.idx', 7),
'commit_sequential_bin': ('/da4_data/All.blobs/commit_{key}.bin', 7),
'tree_sequential_idx': ('/da4_data/All.blobs/tree_{key}.idx', 7),
'tree_sequential_bin': ('/da4_data/All.blobs/tree_{key}.bin', 7),
'tag_data': ('/da4_data/All.blobs/tag_{key}.bin', 7),
'commit_data': ('/da4_data/All.blobs/commit_{key}.bin', 7),
'tree_data': ('/da4_data/All.blobs/tree_{key}.bin', 7),
'blob_data': ('/da4_data/All.blobs/blob_{key}.bin', 7),
# critical - random access to trees and commits on da4 - need to do offsets for the da3
'commit_random': ('/fast/All.sha1c/commit_{key}.tch', 7),
'tree_random': ('/fast/All.sha1c/tree_{key}.tch', 7),
'blob_offset': ('/fast/All.sha1o/sha1.blob_{key}.tch', 7),
'commit_offset': ('/fast/All.sha1o/sha1.commit_{key}.tch', 7),
'tree_offset': ('/fast/All.sha1o/sha1.tree_{key}.tch', 7),
# the rest of x_data is currently unused:
# 'commit_data': ('/data/All.blobs/commit_{key}.bin', # 7)
# 'tree_data': ('/data/All.blobs/tree_{key}.bin', 7)
# 'tag_data': ('/data/All.blobs/tag_{key}.bin', 7)
# relations - good to have but not critical
# move to current version R as they get updated
'commit_projects': ('/da0_data/basemaps/c2pFull{ver}.{key}.tch', 5),
'commit_children': ('/da0_data/basemaps/c2ccFull{ver}.{key}.tch', 5),
'commit_time_author': ('/da0_data/basemaps/c2taFull{ver}.{key}.tch', 5),
'commit_root': ('/da0_data/basemaps/c2rFull{ver}.{key}.tch', 5),
'commit_parent': ('/da0_data/basemaps/c2pcFull{ver}.{key}.tch', 5),
'author_commits': ('/da0_data/basemaps/a2cFull{ver}.{key}.tch', 5),
'author_projects': ('/da0_data/basemaps/a2pFull{ver}.{key}.tch', 5),
'author_files': ('/da0_data/basemaps/a2fFull{ver}.{key}.tch', 5),
'project_authors': ('/da0_data/basemaps/p2aFull{ver}.{key}.tch', 5),
'commit_head': ('/da0_data/basemaps/c2hFull{ver}.{key}.tch', 5),
'commit_blobs': ('/da0_data/basemaps/c2bFull{ver}.{key}.tch', 5),
'commit_files': ('/da0_data/basemaps/c2fFull{ver}.{key}.tch', 5),
'project_commits': ('/da0_data/basemaps/p2cFull{ver}.{key}.tch', 5),
'blob_commits': ('/da0_data/basemaps/b2cFull{ver}.{key}.tch', 5),
'blob_authors': ('/da0_data/basemaps/b2aFull{ver}.{key}.tch', 5),
'file_authors': ('/da0_data/basemaps/f2aFull{ver}.{key}.tch', 5),
'file_commits': ('/da0_data/basemaps/f2cFull{ver}.{key}.tch', 5),
'file_blobs': ('/da0_data/basemaps/f2bFull{ver}.{key}.tch', 5),
'blob_files': ('/da0_data/basemaps/b2fFull{ver}.{key}.tch', 5),
    'author_trpath': ('/da0_data/basemaps/a2trp{ver}.tch', 5),
# another way to get commit parents, currently unused
# 'commit_parents': ('/da0_data/basemaps/c2pcK.{key}.tch', 7)
# SHA1 cache, currently only on da4, da5 668G
'blob_index_line': ('/fast/All.sha1/sha1.blob_{key}.tch', 7),
'tree_index_line': ('/fast/All.sha1/sha1.tree_{key}.tch', 7),
'commit_index_line': ('/fast/All.sha1/sha1.commit_{key}.tch', 7),
'tag_index_line': ('/fast/All.sha1/sha1.tag_{key}.tch', 7)
}
def read_env_var():
global PATHS
all_blobs = [
'commit_sequential_idx', 'commit_sequential_bin', 'tree_sequential_idx',
'tree_sequential_bin', 'tag_data', 'commit_data', 'tree_data', 'blob_data'
]
all_sha1c = [
'commit_random', 'tree_random'
]
all_sha1o = [
'blob_offset', 'commit_offset', 'tree_offset'
]
basemaps = [
'commit_projects', 'commit_children', 'commit_time_author', 'commit_root',
'commit_parent', 'author_commits', 'author_projects', 'project_authors',
'commit_head', 'commit_blobs', 'commit_files', 'project_commits', 'blob_commits',
'blob_authors', 'file_commits', 'file_blobs', 'blob_files', 'author_trpath',
'author_files', 'file_authors'
]
all_sha1 = [
'blob_index_line', 'tree_index_line', 'commit_index_line', 'tag_index_line'
]
    # This maps an environment variable name to a group of key names in the
    # PATHS global. For example, the 'OSCAR_BASEMAPS' variable sets the
    # directory for all entries whose PATHS key appears in the basemaps list,
    # unless overridden by a more specific variable.
general_name_map = {
'OSCAR_ALL_BLOBS': all_blobs,
'OSCAR_ALL_SHA1C': all_sha1c,
'OSCAR_ALL_SHA1O': all_sha1o,
'OSCAR_BASEMAPS': basemaps,
'OSCAR_ALL_SHA1': all_sha1
}
    # This maps an environment variable name to a single key in the PATHS
    # global: each PATHS key gets 'OSCAR_' prepended and is upper-cased.
# For example: OSCAR_COMMIT_DATA environment variable corresponds to PATHS['commit_data']
specific_names = {'_'.join(['OSCAR', name.upper()]): name for name in PATHS.keys()}
ver_names = {'_'.join(['OSCAR', name.upper(), 'VER']): name for name in basemaps}
for v in os.environ.keys():
if not os.environ[v]:
continue
# general directory config
if v in general_name_map.keys():
for name in general_name_map[v]:
f = os.path.basename(PATHS[name][0])
PATHS[name] = (os.path.join(os.environ[v], f), PATHS[name][1])
        # specific directory config overrides general
        elif v in specific_names:
            name = specific_names[v]
            f = os.path.basename(PATHS[name][0])
            PATHS[name] = (os.path.join(os.environ[v], f), PATHS[name][1])
# specific version config
elif v in ver_names.keys():
PATHS[ver_names[v]] = (
PATHS[ver_names[v]][0].format(ver=os.environ[v], key='{key}'),
PATHS[ver_names[v]][1]
)
# general version config
elif v == "OSCAR_BASEMAPS_VER":
for name in basemaps:
if '{ver}' in PATHS[name][0]:
PATHS[name] = (PATHS[name][0].format(ver=os.environ[v], key='{key}'), PATHS[name][1])
# if version not set, default to version R
for key in PATHS.keys():
if '{ver}' in PATHS[key][0]:
PATHS[key] = (PATHS[key][0].format(ver='R', key='{key}'), PATHS[key][1])
read_env_var()
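# Usage sketch (illustrative): relocating data files via environment
# variables before importing this module. The directories below are
# hypothetical; only the variable names come from read_env_var() above.
#
#     export OSCAR_ALL_BLOBS=/mnt/wok/All.blobs   # a whole group of dtypes
#     export OSCAR_COMMIT_DATA=/ssd/All.blobs     # a single dtype, overrides the group
#     export OSCAR_BASEMAPS_VER=S                 # basemap version (default 'R')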
class ObjectNotFound(KeyError):
pass
def unber(s):
# type: (str) -> list
r""" Perl BER unpacking
Format definition: from http://perldoc.perl.org/functions/pack.html
(see "w" template description)
BER is a way to pack several variable-length ints into one
binary string. Here we do the reverse
:param s: a binary string with packed values
:return: a list of unpacked values
>>> unber('\x00\x83M')
[0, 461]
>>> unber('\x83M\x96\x14')
[461, 2836]
>>> unber('\x99a\x89\x12')
[3297, 1170]
"""
res = []
acc = 0
for char in s:
b = ord(char)
acc = (acc << 7) + (b & 0x7f)
if not b & 0x80:
res.append(acc)
acc = 0
return res
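# For reference, the packing direction (Perl's `pack "w"`): each byte carries
# 7 bits of the value, with the high bit set on every byte except the last.
# A minimal sketch of the inverse of unber(), for illustration only:
#
#     def ber(*values):
#         chunks = []
#         for v in values:
#             parts = [chr(v & 0x7f)]
#             v >>= 7
#             while v:
#                 parts.append(chr(0x80 | (v & 0x7f)))
#                 v >>= 7
#             chunks.append(''.join(reversed(parts)))
#         return ''.join(chunks)
#
#     ber(0, 461) == '\x00\x83M'  # matches the first unber() doctest above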
def lzf_length(raw_data):
# type: (str) -> (int, int)
r""" Get length of uncompressed data from a header of Compress::LZF
output. Check Compress::LZF sources for the definition of this bit magic
(namely, LZF.xs, decompress_sv)
:param raw_data: data compressed with Perl Compress::LZF
:return: tuple of (header_size, uncompressed_content_length) in bytes
>>> lzf_length('\xc4\x9b')
(2, 283)
>>> lzf_length('\xc3\xa4')
(2, 228)
>>> lzf_length('\xc3\x8a')
(2, 202)
>>> lzf_length('\xca\x87')
(2, 647)
>>> lzf_length('\xe1\xaf\xa9')
(3, 7145)
>>> lzf_length('\xe0\xa7\x9c')
(3, 2524)
"""
    if not raw_data:
        raise ValueError("LZF compressed data is missing its header")
lower = ord(raw_data[0])
csize = len(raw_data)
start = 1
mask = 0x80
while mask and csize > start and (lower & mask):
mask >>= 1 + (mask == 0x80)
start += 1
if not mask or csize < start:
raise ValueError("LZF compressed data header is corrupted")
usize = lower & (mask - 1)
for i in range(1, start):
usize = (usize << 6) + (ord(raw_data[i]) & 0x3f)
if not usize:
raise ValueError("LZF compressed data header is corrupted")
return start, usize
def decomp(raw_data):
# type: (str) -> str
""" lzf wrapper to handle perl tweaks in Compress::LZF
This function extracts uncompressed size header
and then does usual lzf decompression.
Please check Compress::LZF sources for the definition of this bit magic
:param raw_data: data compressed with Perl Compress::LZF
:return: string of unpacked data
"""
if not raw_data:
return ""
elif raw_data[0] == '\x00':
return raw_data[1:]
start, usize = lzf_length(raw_data)
return lzf.decompress(raw_data[start:], usize)
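# Quick sanity check (illustrative): Compress::LZF stores incompressible
# payloads verbatim behind a single '\x00' marker byte, which is the
# passthrough branch above; compressed payloads instead start with the
# variable-length size header parsed by lzf_length():
#
#     decomp('\x00hello') == 'hello'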
def cached_property(func):
""" Classic memoize with @property on top"""
@wraps(func)
def wrapper(self):
key = "_" + func.__name__
if not hasattr(self, key):
setattr(self, key, func(self))
return getattr(self, key)
return property(wrapper)
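# A minimal sketch of how cached_property behaves (illustrative):
#
#     class Foo(object):
#         @cached_property
#         def answer(self):
#             print("computing...")  # executed only once
#             return 42
#
#     foo = Foo()
#     foo.answer  # prints "computing...", returns 42
#     foo.answer  # returns 42 straight from foo._answer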
def slice20(raw_data):
""" Slice raw_data into 20-byte chunks and hex encode each of them
"""
if raw_data is None:
return ()
return tuple(raw_data[i:i + 20].encode('hex')
for i in range(0, len(raw_data), 20))
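# Example (illustrative): a concatenation of two 20-byte binary shas becomes
# a tuple of two 40-char hex strings:
#
#     slice20('\xab' * 20 + '\xcd' * 20) == ('ab' * 20, 'cd' * 20)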
class CommitTimezone(tzinfo):
# a lightweight version of pytz._FixedOffset
def __init__(self, hours, minutes):
self.offset = timedelta(hours=hours, minutes=minutes)
def utcoffset(self, dt):
return self.offset
def tzname(self, dt):
return 'fixed'
def dst(self, dt):
# daylight saving time - no info
return timedelta(0)
    def __repr__(self):
        # days is negative for west-of-UTC offsets, so rebuild the signed total
        total = self.offset.days * 24 * 60 + self.offset.seconds // 60
        sign = '-' if total < 0 else ''
        h, m = divmod(abs(total), 60)
        return "<Timezone: %s%02d:%02d>" % (sign, h, m)
DAY_Z = datetime.fromtimestamp(0, CommitTimezone(0, 0))
def parse_commit_date(timestamp):
""" Parse date string of authored_at/commited_at
git log time is in the original timezone
gitpython - same as git log (also, it has the correct timezone)
unix timestamps (used internally by commit objects) are in UTC
datetime.fromtimestamp without a timezone will convert it to host tz
github api is in UTC (this is what trailing 'Z' means)
    :param timestamp: Commit.authored_at or Commit.committed_at,
e.g. '1337145807 +1100'
:type timestamp: str
:return: UTC datetime
:rtype: datetime.datetime or None
>>> parse_commit_date('1337145807 +1100')
datetime.datetime(2012, 5, 16, 16, 23, 27, tzinfo=<Timezone: 11:00>)
>>> parse_commit_date('3337145807 +1100') is None
True
"""
ts, tz = timestamp.split()
sign = -1 if tz.startswith('-') else 1
try:
ts = int(ts)
        hours, minutes = sign * int(tz[-4:-2]), sign * int(tz[-2:])
dt = datetime.fromtimestamp(ts, CommitTimezone(hours, minutes))
except ValueError:
# i.e. if timestamp or timezone is invalid
return None
# timestamp is in the future
if ts > time.time():
return None
return dt
# Pool of open TokyoCabinet databases to save a few milliseconds on opening
_TCH_POOL = {}
def _get_tch(path):
if not path.endswith('.tch'):
path += '.tch'
if path not in _TCH_POOL:
_TCH_POOL[path] = tch.Hash()
_TCH_POOL[path].open(path, tch.HDBOREADER | tch.HDBONOLCK)
# _TCH_POOL[path].setmutex()
return _TCH_POOL[path]
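# Handles stay open for the lifetime of the process. A sketch of the pooling
# behaviour (illustrative):
#
#     db1 = _get_tch('/fast/All.sha1c/commit_5.tch')
#     db2 = _get_tch('/fast/All.sha1c/commit_5')  # '.tch' is appended
#     db1 is db2  # True - the same pooled handle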
def read_tch(path, key, silent=False):
    """ Read a value from a Tokyo Cabinet file by the specified key
    The main purpose of this method is to cache open .tch handles
    in _TCH_POOL to speed up reads
    """
    try:
        return _get_tch(path)[key]
    except KeyError:
        if silent:
            return None
        raise ObjectNotFound(path + " " + key)
    except Exception:
        # e.g. the .tch file itself is missing or unreadable
        return None
def tch_keys(path, key_prefix=''):
return _get_tch(path).fwmkeys(key_prefix)
def resolve_path(dtype, object_key, use_fnv=False):
# type: (str, str, bool) -> str
""" Get path to a file using data type and object key (for sharding) """
path, prefix_length = PATHS[dtype]
p = fnvhash.fnv1a_32(object_key) if use_fnv else ord(object_key[0])
prefix = p & (2**prefix_length - 1)
return path.format(key=prefix)
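# Sharding sketch (illustrative, assuming the default PATHS): git objects are
# keyed by the first byte of their binary sha, masked to the prefix length,
# while non-git objects hash the whole key with FNV-1a first:
#
#     bin_sha = '05cf84081b63cda822ee407e688269b494a642de'.decode('hex')
#     resolve_path('commit_random', bin_sha)
#     # -> '/fast/All.sha1c/commit_5.tch'   (0x05 & 0x7f == 5)
#     resolve_path('project_commits', 'user2589_minicms', use_fnv=True)
#     # -> '/da0_data/basemaps/p2cFullR.<fnv1a_32(key) & 0x1f>.tch'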
class _Base(object):
type = None
key = None
# fnv keys are used for non-git objects, such as files, projects and authors
use_fnv_keys = True
_keys_registry_dtype = None
def __init__(self, key):
"""
:param key: unique identifier for an object of this type
"""
self.key = key
def __repr__(self):
return "<%s: %s>" % ((self.type or 'OscarBase').capitalize(), self.key)
def __hash__(self):
return hash(self.key)
def __eq__(self, other):
"""
>>> sha = 'f2a7fcdc51450ab03cb364415f14e634fa69b62c'
>>> Commit(sha) == Commit(sha)
True
>>> Commit(sha) == Blob(sha)
False
"""
return isinstance(other, type(self)) \
and self.type == other.type \
and self.key == other.key
def __ne__(self, other):
return not self == other
def __str__(self):
return self.key
def resolve_path(self, dtype):
return resolve_path(dtype, self.key, self.use_fnv_keys)
def read_tch(self, dtype, silent=True):
""" Resolve the path and read .tch"""
return read_tch(self.resolve_path(dtype), self.key, silent)
    @classmethod
def all(cls):
""" Iterate all objects of the given type
This might be useful to get a list of all projects, or a list of
all file names.
        Returns:
            a generator of objects of the class this method was called on
        """
        if not cls._keys_registry_dtype:
            raise NotImplementedError
base_path, prefix_length = PATHS[cls._keys_registry_dtype]
for file_prefix in range(2 ** prefix_length):
tch_path = base_path.format(key=file_prefix)
for key in tch_keys(tch_path):
yield cls(key)
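# Usage sketch (illustrative): enumerating every known project or file name.
# This walks all 2**prefix_length shards, so it is slow and requires the
# corresponding .tch files to be present on this host:
#
#     for project in Project.all():
#         process(project)  # `process` is a placeholder
#     for f in File.all():
#         process(f)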
class GitObject(_Base):
use_fnv_keys = False
@classmethod
def all(cls):
""" Iterate ALL objects of this type (all projects, all times) """
base_idx_path, prefix_length = PATHS[cls.type + '_sequential_idx']
base_bin_path, prefix_length = PATHS[cls.type + '_sequential_bin']
for key in range(2**prefix_length):
idx_path = base_idx_path.format(key=key)
bin_path = base_bin_path.format(key=key)
            datafile = open(bin_path, 'rb')
            for line in open(idx_path):
                chunks = line.strip().split(";")
                if len(chunks) > 4:
                    # blob index lines have an extra field:
                    # the length of the uncompressed content
                    offset, comp_length, full_length, sha = chunks[1:5]
                else:
                    offset, comp_length, sha = chunks[1:4]
obj = cls(sha)
obj._data = decomp(datafile.read(int(comp_length)))
yield obj
datafile.close()
def __init__(self, sha):
"""
:param sha: either a 40 char hex or a 20 bytes binary SHA1 hash
>>> sha = '05cf84081b63cda822ee407e688269b494a642de'
>>> GitObject(sha.decode('hex')).sha == sha
True
>>> GitObject(sha).bin_sha == sha.decode('hex')
True
"""
if len(sha) == 40:
self.sha = sha
self.bin_sha = sha.decode("hex")
elif len(sha) == 20:
self.sha = sha.encode("hex")
self.bin_sha = sha
else:
raise ValueError("Invalid SHA1 hash: %s" % sha)
self.key = self.sha
        super(GitObject, self).__init__(self.sha)
def resolve_path(self, dtype):
# overriding to use bin_sha instead of the key (which is sha)
return resolve_path(dtype, self.bin_sha, self.use_fnv_keys)
def read_tch(self, dtype, silent=True):
""" Resolve the path and read .tch"""
return read_tch(self.resolve_path(dtype), self.bin_sha, silent)
@cached_property
def data(self):
if self.type not in ('commit', 'tree'):
raise NotImplementedError
# default implementation will only work for commits and trees
return decomp(self.read_tch(self.type + '_random', silent=False))
@classmethod
def string_sha(cls, data):
"""Manually compute blob sha from its content passed as `data`.
The main use case for this method is to identify source of a file.
Blob SHA is computed from a string:
"blob <file content length as str><null byte><file content>"
# https://gist.github.com/masak/2415865
Commit SHAs are computed in a similar way
"commit <commit length as str><null byte><commit content>"
note that commit content includes committed/authored date
Args:
data (str): content of the GitObject to get hash for
        Returns:
            str: 40-char hex SHA1 hash
"""
sha1 = hashlib.sha1()
sha1.update("%s %d\x00" % (cls.type, len(data)))
sha1.update(data)
return sha1.hexdigest()
    @classmethod
    def file_sha(cls, path):
        """ Compute the git object sha of a file without reading it
        into memory all at once """
        buffsize = 1024 ** 2
        size = os.stat(path).st_size
        sha1 = hashlib.sha1()
        sha1.update("%s %d\x00" % (cls.type, size))
        with open(path, 'rb') as fh:
            while True:
                data = fh.read(buffsize)
                if not data:
                    break
                sha1.update(data)
        return sha1.hexdigest()
def __str__(self):
"""
>>> print(Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c'))
tree d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d
parent 66acf0a046a02b48e0b32052a17f1e240c2d7356
author Pavel Puchkin <neoascetic@gmail.com> 1375321509 +1100
committer Pavel Puchkin <neoascetic@gmail.com> 1375321597 +1100
<BLANKLINE>
License changed :P
<BLANKLINE>
"""
return self.data
class Blob(GitObject):
type = 'blob'
def __len__(self):
_, length = self.position
return length
@classmethod
def string_sha(cls, data):
"""
>>> Blob.string_sha('Hello world!')
'6769dd60bdf536a83c9353272157893043e9f7d0'
"""
# return pygit2.hash(data)
return super(Blob, cls).string_sha(data)
@classmethod
def file_sha(cls, path):
"""Manually compute blob sha from a file content.
Similar to string_sha
>>> Blob.file_sha('LICENSE')
'94a9ed024d3859793618152ea559a168bbcbb5e2'
"""
# return pygit2.hashfile(path)
return super(Blob, cls).file_sha(path)
@cached_property
def position(self):
""" Get offset and length of the blob data in the storage """
        try:
            offset, length = unber(self.read_tch('blob_offset'))
        except (ValueError, TypeError):  # empty or missing value
            raise ObjectNotFound('Blob data not found (bad sha?)')
return offset, length
@cached_property
def data(self):
""" Content of the blob """
offset, length = self.position
# no caching here to stay thread-safe
with open(self.resolve_path('blob_data'), 'rb') as fh:
fh.seek(offset)
return decomp(fh.read(length))
@cached_property
def commit_shas(self):
""" SHAs of Commits in which this blob have been
introduced or modified.
**NOTE: commits removing this blob are not included**
"""
return slice20(self.read_tch('blob_commits'))
@property
def commits(self):
""" Commits where this blob has been added or changed
**NOTE: commits removing this blob are not included**
"""
        return (Commit(sha) for sha in self.commit_shas)
class Tree(GitObject):
""" A representation of git tree object, basically - a directory.
Trees are iterable. Each element of the iteration is a 3-tuple:
`(mode, filename, sha)`
    - `mode` is an ASCII decimal **string** similar to file mode
        in Unix systems. Subtrees always have mode "40000"
    - `filename` is a string filename, not including directories
    - `sha` is a 40-char hex string representing the file content Blob SHA
.. Note:: iteration is not recursive.
For a recursive walk, use Tree.traverse() or Tree.files
Both files and blobs can be checked for membership,
either by their id (filename or SHA) or a corresponding object:
>>> tree = Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d")
>>> '.gitignore' in tree
True
>>> File('.keep') in tree
False
>>> '83d22195edc1473673f1bf35307aea6edf3c37e3' in tree
True
>>> Blob('83d22195edc1473673f1bf35307aea6edf3c37e3') in tree
True
`len(tree)` returns the number of files under the tree, including files in
subtrees but not the subtrees themselves:
>>> len(Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d"))
16
"""
type = 'tree'
def __iter__(self):
""" Unpack binary tree structures, yielding 3-tuples of
(mode (ASCII decimal), filename, sha (40 bytes hex))
        Format description: https://stackoverflow.com/questions/14790681/
            mode (ASCII encoded decimal)
            SPACE (\x20)
            filename
            NULL (\x00)
            20-byte binary hash
>>> len(list(Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d")))
6
>>> all(len(line) == 3
... for line in Tree("954829887af5d9071aa92c427133ca2cdd0813cc"))
True
"""
data = self.data
i = 0
while i < len(data):
# mode
start = i
while i < len(data) and data[i] != " ":
i += 1
mode = data[start:i]
i += 1
# file name
start = i
while i < len(data) and data[i] != "\x00":
i += 1
fname = data[start:i]
# sha
start = i + 1
i += 21
yield mode, fname, data[start:i].encode('hex')
def __len__(self):
return len(self.files)
def __contains__(self, item):
if isinstance(item, File):
return item.key in self.files
elif isinstance(item, Blob):
return item.sha in self.blob_shas
elif not isinstance(item, str):
return False
return item in self.blob_shas or item in self.files
    def traverse(self):
""" Recursively traverse the tree
This will generate 3-tuples of the same format as direct tree
iteration, but will recursively include subtrees content.
:return: generator of (mode, filename, blob/tree sha)
>>> c = Commit("1e971a073f40d74a1e72e07c682e1cba0bae159b")
>>> len(list(c.tree.traverse()))
8
>>> c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc')
>>> len(list(c.tree.traverse()))
36
"""
for mode, fname, sha in self:
yield mode, fname, sha
# trees are always 40000:
# https://stackoverflow.com/questions/1071241
if mode == "40000":
for mode2, fname2, sha2 in Tree(sha).traverse():
yield mode2, fname + '/' + fname2, sha2
@property
def full(self):
""" Formatted tree content, including recursive files and subtrees
It is intended for debug purposes only.
:return: multiline string, where each line contains mode, name and sha,
with subtrees expanded
"""
files = sorted(self.traverse(), key=lambda x: x[1])
return "\n".join(" ".join(line) for line in files)
def __str__(self):
"""
>>> print(Tree("954829887af5d9071aa92c427133ca2cdd0813cc"))
100644 __init__.py ff1f7925b77129b31938e76b5661f0a2c4500556
100644 admin.py d05d461b48a8a5b5a9d1ea62b3815e089f3eb79b
100644 models.py d1d952ee766d616eae5bfbd040c684007a424364
40000 templates 7ff5e4c9bd3ce6ab500b754831d231022b58f689
40000 templatetags e5e994b0be2c9ce6af6f753275e7d8c29ccf75ce
100644 urls.py e9cb0c23a7f6683911305efff91dcabadb938794
100644 utils.py 2cfbd298f18a75d1f0f51c2f6a1f2fcdf41a9559
100644 views.py 973a78a1fe9e69d4d3b25c92b3889f7e91142439
"""
return "\n".join(" ".join(line) for line in self)
@cached_property
def files(self):
""" A dict of all files and their content/blob sha under this tree.
It includes recursive files (i.e. files in subdirectories).
It does NOT include subdirectories themselves.
"""
return {fname: sha
for mode, fname, sha in self.traverse() if mode != "40000"}
@property
def blob_shas(self):
"""A tuple of all file content shas, including files in subdirectories
"""
return tuple(self.files.values())
@property
def blobs(self):
""" A generator of Blob objects with file content.
It does include files in subdirectories.
>>> tuple(Tree('d20520ef8c1537a42628b72d481b8174c0a1de84').blobs
... ) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
(<Blob: 2bdf5d686c6cd488b706be5c99c3bb1e166cf2f6>, ...,
<Blob: c006bef767d08b41633b380058a171b7786b71ab>)
"""
return (Blob(sha) for sha in self.blob_shas)
class Commit(GitObject):
""" A git commit object.
Commits have some special properties.
Most of object properties provided by this project are lazy, i.e. they are
computed when you access them for the first time.
The following `Commit` properties will be instantiated all at once on the
first access to *any* of them.
- :data:`tree`: root `Tree` of the commit
- :data:`parent_shas`: tuple of parent commit sha hashes
- :data:`message`: str, first line of the commit message
- :data:`full_message`: str, full commit message
- :data:`author`: str, Name <email>
- :data:`authored_at`: str, unix_epoch+timezone
- :data:`committer`: str, Name <email>
- :data:`committed_at`: str, unix_epoch+timezone
"""
type = 'commit'
def __getattr__(self, attr):
""" Mimic special properties:
tree: root Tree of the commit
parent_shas: tuple of parent commit sha hashes
message: str, first line of the commit message
full_message: str, full commit message
author: str, Name <email>
authored_at: timezone-aware datetime or None (if invalid)
committer: str, Name <email>
committed_at: timezone-aware datetime or None (if invalid)
signature: str or None, PGP signature
Commit: https://github.com/user2589/minicms/commit/e38126db
>>> c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc')
>>> c.author.startswith('Marat')
True
>>> c.authored_at
datetime.datetime(2012, 5, 19, 1, 14, 8, tzinfo=<Timezone: 11:00>)
>>> c.tree.sha
'6845f55f47ddfdbe4628a83fdaba35fa4ae3c894'
>>> len(c.parent_shas)
1
>>> c.parent_shas[0]
'ab124ab4baa42cd9f554b7bb038e19d4e3647957'
>>> c.committed_at
datetime.datetime(2012, 5, 19, 1, 14, 8, tzinfo=<Timezone: 11:00>)
"""
attrs = ('tree', 'parent_shas', 'message', 'full_message', 'author',
'committer', 'authored_at', 'committed_at', 'signature')
if attr not in attrs:
raise AttributeError
for a in attrs:
setattr(self, a, None)
self.header, self.full_message = self.data.split("\n\n", 1)
self.message = self.full_message.split("\n", 1)[0]
parent_shas = []
signature = None
reading_signature = False
for line in self.header.split("\n"):
if reading_signature:
# examples:
# 1cc6f4418dcc09f64dcbb0410fec76ceaa5034ab
# cbbc685c45bdff4da5ea0984f1dd3a73486b4556
signature += line
if line.strip() == "-----END PGP SIGNATURE-----":
self.signature = signature
reading_signature = False
continue
if line.startswith(" "): # mergetag object, not supported (yet?)
# example: c1313c68c7f784efaf700fbfb771065840fc260a
continue
line = line.strip()
if not line: # sometimes there is an empty line after gpgsig
continue
try:
key, value = line.split(" ", 1)
except ValueError:
raise ValueError("Unexpected header in commit " + self.sha)
if key == "tree":
self.tree = Tree(value)
elif key == "parent": # multiple parents possible
parent_shas.append(value)
elif key == "author":
# author name can have arbitrary number of spaces while
# timestamp is guaranteed to have one, so rsplit
chunks = value.rsplit(" ", 2)
self.author = chunks[0]
self.authored_at = parse_commit_date(" ".join(chunks[1:]))
elif key == "committer":
# same logic as author
chunks = value.rsplit(" ", 2)
self.committer = chunks[0]
self.committed_at = parse_commit_date(" ".join(chunks[1:]))
elif key == 'gpgsig':
signature = value
reading_signature = True
self.parent_shas = tuple(parent_shas)
return getattr(self, attr)
def __sub__(self, parent, threshold=0.5):
""" Compare two Commits.
Args:
parent (Commit): another commit to compare to.
Expected order is `diff = child_commit - parent_commit`
Returns:
Generator[Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]]:
4-tuples: `(old_path, new_path, old_sha, new_sha)`
Examples:
- a new file 'setup.py' was created:
`(None, 'setup.py', None, 'file_sha')`
- an existing 'setup.py' was deleted:
`('setup.py', None, 'old_file_sha', None)`
- setup.py.old was renamed to setup.py, content unchanged:
`('setup.py.old', 'setup.py', 'file_sha', 'file_sha')`
- setup.py was edited:
`('setup.py', 'setup.py', 'old_file_sha', 'new_file_sha')`
- setup.py.old was edited and renamed to setup.py:
`('setup.py.old', 'setup.py', 'old_file_sha', 'new_file_sha')`
Detecting the last one is computationally expensive. You can adjust this
behaviour by passing the `threshold` parameter, which is 0.5 by default.
It means that if roughly 50% of the file content is the same,
it is considered a match. `threshold=1` means that only exact
matches are considered, effectively disabling this comparison.
If threshold is set to 0, any pair of deleted and added file will be
considered renamed and edited; this last case doesn't make much sense so
don't set it too low.
"""
if parent.sha not in self.parent_shas:
warnings.warn("Comparing non-adjacent commits might be "
"computationally expensive. Proceed with caution.")
# filename: (blob sha before, blob sha after)
new_files = self.tree.files
new_paths = set(new_files.keys())
old_files = parent.tree.files
old_paths = set(old_files.keys())
        # changed files: present in both trees but with different blob shas
        for fname in new_paths.intersection(old_paths):
            if new_files[fname] != old_files[fname]:
                # i.e. the blob sha changed, so the file was modified
                yield (fname, fname, old_files[fname], new_files[fname])
added_paths = new_paths - old_paths
deleted_paths = old_paths - new_paths
if threshold >= 1: # i.e. only exact matches are considered
for fname in added_paths:
yield (None, fname, None, new_files[fname])
for fname in deleted_paths:
yield (fname, None, old_files[fname], None)
return
        # search for matches among deleted/added files (i.e. renames)
        sm = difflib.SequenceMatcher()
        added_blobs = {f: Blob(new_files[f]) for f in added_paths}
        deleted_blobs = {f: Blob(old_files[f]) for f in deleted_paths}
        # for each added blob, try to find a match in deleted blobs;
        # if there is a match, signal a rename and remove it from deleted,
        # if there is no match, signal a new file.
        # Deleted blobs left unmatched are indeed deleted
        for added_fname, added_blob in added_blobs.items():
            sm.set_seq1(added_blob.data)
            matched = False
            for deleted_fname, deleted_blob in deleted_blobs.items():
                sm.set_seq2(deleted_blob.data)
                # cheap checks first (both are upper bounds on ratio())
                if sm.real_quick_ratio() > threshold \
                        and sm.quick_ratio() > threshold \
                        and sm.ratio() > threshold:
                    yield (deleted_fname, added_fname,
                           deleted_blob.sha, added_blob.sha)
                    del deleted_blobs[deleted_fname]
                    matched = True
                    break
            if not matched:  # this is a new file
                yield (None, added_fname, None, added_blob.sha)
        for deleted_fname, deleted_blob in deleted_blobs.items():
            yield (deleted_fname, None, deleted_blob.sha, None)
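    # Usage sketch (illustrative): list files changed by a commit relative to
    # its first parent, reusing the shas from the doctests above:
    #
    #     c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc')
    #     parent = Commit(c.parent_shas[0])
    #     for old_path, new_path, old_sha, new_sha in c - parent:
    #         print(old_path, '->', new_path)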
@property
def parents(self):
""" A generator of parent commits.
If you only need hashes (and not `Commit` objects),
        use `.parent_shas` instead
Commit: https://github.com/user2589/minicms/commit/e38126db
>>> c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc')
>>> tuple(c.parents)
(<Commit: ab124ab4baa42cd9f554b7bb038e19d4e3647957>,)
"""
return (Commit(sha) for sha in self.parent_shas)
@cached_property
def project_names(self):
# type: () -> tuple
""" URIs of projects including this commit.
This property can be used to find all forks of a project
by its first commit.
Commit: https://github.com/user2589/minicms/commit/f2a7fcdc
>>> c = Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c')
>>> isinstance(c.project_names, tuple)
True
>>> len(c.project_names) > 0
True
>>> 'user2589_minicms' in c.project_names
True
"""
data = decomp(self.read_tch('commit_projects'))
return tuple(project_name
for project_name in (data and data.split(";")) or []
if project_name and project_name != 'EMPTY')
@property
def projects(self):
""" A generator of `Project` s, in which this commit is included.
"""
return (Project(uri) for uri in self.project_names)
@cached_property
def child_shas(self):
""" Children commit binary sha hashes.
Basically, this is a reverse parent_shas
Commit: https://github.com/user2589/minicms/commit/1e971a07
>>> Commit('1e971a073f40d74a1e72e07c682e1cba0bae159b').child_shas
('9bd02434b834979bb69d0b752a403228f2e385e8',)
"""
return slice20(self.read_tch('commit_children'))
@property
def children(self):
""" A generator of children `Commit` objects
Commit: https://github.com/user2589/minicms/commit/1e971a07
>>> tuple(Commit('1e971a073f40d74a1e72e07c682e1cba0bae159b').children)
(<Commit: 9bd02434b834979bb69d0b752a403228f2e385e8>,)
"""
return (Commit(sha) for sha in self.child_shas)
@cached_property
def blob_shas(self):
""" SHA hashes of all blobs in the commit
>>> Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blob_shas
... # doctest: +NORMALIZE_WHITESPACE
('b2f49ffef1c8d7ce83a004b34035f917713e2766',
'c92011c5ccc32a9248bd929a6e56f846ac5b8072',
'bf3c2d2df2ef710f995b590ac3e2c851b592c871')
"""
return self.tree.blob_shas
@cached_property
def changed_file_names(self):
data = decomp(self.read_tch('commit_files'))
return tuple((data and data.split(";")) or [])
def files_changed(self):
return (File(filename) for filename in self.changed_file_names)
@property
def blob_shas_rel(self):
"""
This relation is known to miss every first file in all trees.
Consider using Commit.tree.blobs as a slower but more accurate
alternative.
When this relation passes the test, please replace blob_sha with it
It should be faster but as of now it is not accurate
"""
warnings.warn(
"This relation is known to miss every first file in all trees. "
"Consider using Commit.tree.blobs as a slower but more accurate "
"alternative", DeprecationWarning)
return slice20(self.read_tch('commit_blobs'))
@property
def blobs(self):
""" A generator of `Blob` objects included in this commit
>>> tuple(Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blobs)
... # doctest: +NORMALIZE_WHITESPACE
(<Blob: b2f49ffef1c8d7ce83a004b34035f917713e2766>,
<Blob: c92011c5ccc32a9248bd929a6e56f846ac5b8072>,
<Blob: bf3c2d2df2ef710f995b590ac3e2c851b592c871>)
"""
        return (Blob(sha) for sha in self.blob_shas)
@cached_property
def files(self):
data = decomp(self.read_tch('commit_files'))
return tuple(file_name
for file_name in (data and data.split(";")) or [] if file_name and file_name != 'EMPTY')
class Commit_info(GitObject):
    """ Access to auxiliary commit relations:
    authored time/author and the head commit """
    @cached_property
    def time_author(self):
        data = self.read_tch('commit_time_author')
        return tuple((data and data.split(";")) or [])
    @cached_property
    def head(self):
        return slice20(self.read_tch('commit_head'))
class Tag(GitObject):
""" Tag doesn't have any functionality associated.
You can't really do anything useful with it yet
"""
type = 'tag'
class Project(_Base):
"""
Projects are initialized with a URI:
- Github: `{user}_{repo}`, e.g. `user2589_minicms`
- Gitlab: `gl_{user}_{repo}`
- Bitbucket: `bb_{user}_{repo}`
- Bioconductor: `bioconductor.org_{user}_{repo}`
- kde: `kde.org_{user}_{repo}`
- drupal: `drupal.org_{user}_{repo}`
    - Googlesource: `android.googlesource.com_{repo}_{user}`
- Linux kernel: `git.kernel.org_{user}_{repo}`
- PostgreSQL: `git.postgresql.org_{user}_{repo}`
- GNU Savannah: `git.savannah.gnu.org_{user}_{repo}`
- ZX2C4: `git.zx2c4.com_{user}_{repo}`
- GNOME: `gitlab.gnome.org_{user}_{repo}`
- repo.or.cz: `repo.or.cz_{user}_{repo}`
- Salsa: `salsa.debian.org_{user}_{repo}`
- SourceForge: `sourceforge.net_{user}_{repo}`
Projects are iterable:
>>> for commit in Project('user2589_minicms'): # doctest: +SKIP
... print(commit.sha)
Commits can be checked for membership in a project, either by their SHA
hash or by a Commit object itself:
Commit: https://github.com/user2589/minicms/commit/e38126db
>>> sha = 'e38126dbca6572912013621d2aa9e6f7c50f36bc'
>>> sha in Project('user2589_minicms')
True
>>> Commit(sha) in Project('user2589_minicms')
True
"""
type = 'project'
_keys_registry_dtype = 'project_commits'
def __init__(self, uri):
self.uri = uri
super(Project, self).__init__(uri)
def __iter__(self):
""" Generator of all commits in the project.
Order of commits is not guaranteed
>>> commits = tuple(Project('user2589_minicms'))
>>> len(commits) > 60
True
>>> isinstance(commits[0], Commit)
True
"""
for sha in self.commit_shas:
try:
c = Commit(sha)
author = c.author
except ObjectNotFound:
continue
if author != 'GitHub Merge Button <merge-button@github.com>':
yield c
def __contains__(self, item):
if isinstance(item, Commit):
key = item.key
elif isinstance(item, str):
if len(item) == 20:
key = item.encode('hex')
elif len(item) == 40:
key = item
else:
return False
else:
return False
return key in self.commit_shas
@cached_property
def commit_shas(self):
""" SHA1 of all commits in the project
>>> Project('user2589_django-currencies').commit_shas
... # doctest: +NORMALIZE_WHITESPACE
('2dbcd43f077f2b5511cc107d63a0b9539a6aa2a7',
'7572fc070c44f85e2a540f9a5a05a95d1dd2662d')
"""
tch_path = self.resolve_path('project_commits')
return slice20(read_tch(tch_path, self.key, silent=True))
@property
def commits(self):
""" A generator of all Commit objects in the project.
It has the same effect as iterating a `Project` instance itself,
with some additional validation of commit dates.
>>> tuple(Project('user2589_django-currencies').commits)
... # doctest: +NORMALIZE_WHITESPACE
(<Commit: 2dbcd43f077f2b5511cc107d63a0b9539a6aa2a7>,
<Commit: 7572fc070c44f85e2a540f9a5a05a95d1dd2662d>)
"""
commits = tuple(c for c in self)
tails = tuple(c for c in commits
if not c.parent_shas and c.authored_at is not None)
if tails:
min_date = min(c.authored_at for c in tails)
else: # i.e. if all tails have invalid date
min_date = DAY_Z
for c in commits:
if c.authored_at and c.authored_at < min_date:
c.authored_at = None
yield c
@cached_property
def head(self):
""" Get the HEAD commit of the repository
>>> Project('user2589_minicms').head
<Commit: f2a7fcdc51450ab03cb364415f14e634fa69b62c>
>>> Project('RoseTHERESA_SimpleCMS').head
<Commit: a47afa002ccfd3e23920f323b172f78c5c970250>
"""
# Sometimes (very rarely) commit dates are wrong, so the latest commit
# is not actually the head. The magic below is to account for this
commits = {c.sha: c for c in self.commits}
parents = set().union(*(c.parent_shas for c in commits.values()))
heads = set(commits.keys()) - parents
# it is possible that there is more than one head.
# E.g. it happens when HEAD is moved manually (git reset)
# and continued with a separate chain of commits.
# in this case, let's just use the latest one
# actually, storing refs would make it much simpler
return sorted((commits[sha] for sha in heads),
key=lambda c: c.authored_at or DAY_Z)[-1]
@cached_property
def tail(self):
""" Get the first commit SHA by following first parents
>>> Project('user2589_minicms').tail
'1e971a073f40d74a1e72e07c682e1cba0bae159b'
"""
commits = {c.sha: c for c in self.commits}
pts = set(c.parent_shas[0] for c in commits.values() if c.parent_shas)
for sha, c in commits.items():
if sha in pts and not c.parent_shas:
return sha
@property
def commits_fp(self):
""" Get a commit chain by following only the first parent, to mimic
https://git-scm.com/docs/git-log#git-log---first-parent .
Thus, you only get a small subset of the full commit tree:
>>> p = Project('user2589_minicms')
>>> set(c.sha for c in p.commits_fp).issubset(p.commit_shas)
True
In scenarios where branches are not important, it can save a lot
of computing.
Note: commits will come in order from the latest to the earliest.
"""
# Simplified version of self.head():
# - slightly less precise,
# - 20% faster
#
# out of 500 randomly sampled projects, 493 had the same head.
# In the remaining 7:
# 2 had the same commit chain length,
# 3 had one more commit
# 1 had two more commits
# 1 had three more commits
# Execution time:
# simplified version (argmax): ~153 seconds
# self.head(): ~190 seconds
# at this point we know all commits are in the dataset
# (validated in __iter___)
commits = {c.sha: c for c in self.commits}
commit = max(commits.values(), key=lambda c: c.authored_at or DAY_Z)
while commit:
try: # here there is no guarantee commit is in the dataset
first_parent = commit.parent_shas and commit.parent_shas[0]
except ObjectNotFound:
break
yield commit
if not first_parent:
break
commit = commits.get(first_parent, Commit(first_parent))
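    # Usage sketch (illustrative): the first-parent chain length is a common
    # proxy for the size of the mainline history:
    #
    #     p = Project('user2589_minicms')
    #     chain = list(p.commits_fp)        # latest commit first
    #     len(chain) <= len(p.commit_shas)  # True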
def toURL(self):
'''
Get the URL for a given project URI
>>> Project('CS340-19_lectures').toURL()
        'https://github.com/CS340-19/lectures'
'''
p_name = self.uri
found = False
        toUrlMap = {
            "bb": "bitbucket.org", "gl": "gitlab.com",
            "android.googlesource.com": "android.googlesource.com",
            "bioconductor.org": "bioconductor.org",
            "drupal.org": "git.drupal.org",
            "git.eclipse.org": "git.eclipse.org",
            "git.kernel.org": "git.kernel.org",
            "git.postgresql.org": "git.postgresql.org",
            "git.savannah.gnu.org": "git.savannah.gnu.org",
            "git.zx2c4.com": "git.zx2c4.com",
            "gitlab.gnome.org": "gitlab.gnome.org",
            "kde.org": "anongit.kde.org",
            "repo.or.cz": "repo.or.cz",
            "salsa.debian.org": "salsa.debian.org",
            "sourceforge.net": "git.code.sf.net/p"}
for URL in toUrlMap.keys():
URL_ = URL + "_"
if p_name.startswith(URL_) and (p_name.count('_') >= 2 or URL == "sourceforge.net"):
replacement = toUrlMap[URL] + "/"
p_name = p_name.replace(URL_, replacement)
found = True
break
if not found:
p_name = "github.com/" + p_name
p_name = p_name.replace('_', '/', 1)
return "https://" + p_name
@cached_property
def author_names(self):
data = decomp(self.read_tch('project_authors'))
return tuple(author_name
for author_name in (data and data.split(";")) or [] if author_name and author_name != 'EMPTY')
class File(_Base):
"""
Files are initialized with a path, starting from a commit root tree:
>>> File('.gitignore') # doctest: +SKIP
>>> File('docs/Index.rst') # doctest: +SKIP
"""
type = 'file'
_keys_registry_dtype = 'file_commits'
def __init__(self, path):
self.path = path
super(File, self).__init__(path)
@cached_property
def authors(self):
        data = decomp(self.read_tch('file_authors'))
        return tuple((data and data.split(";")) or [])
@cached_property
def commit_shas(self):
""" SHA1 of all commits changing this file
**NOTE: this relation considers only diff with the first parent,
which substantially limits its application**
>>> commits = File('minicms/templatetags/minicms_tags.py').commit_shas
>>> len(commits) > 0
True
>>> isinstance(commits, tuple)
True
>>> isinstance(commits[0], str)
True
>>> len(commits[0]) == 40
True
"""
file_path = self.key
#if not file_path.endswith("\n"):
# file_path += "\n"
tch_path = resolve_path('file_commits', file_path, self.use_fnv_keys)
return slice20(read_tch(tch_path, file_path, silent=True))
@property
def commits(self):
""" All commits changing the file
.. note: this relation considers only diff with the first parent,
which substantially limits its application
>>> cs = tuple(File('minicms/templatetags/minicms_tags.py').commits)
>>> len(cs) > 0
True
>>> isinstance(cs[0], Commit)
True
"""
for sha in self.commit_shas:
c = Commit(sha)
try:
author = c.author
except ObjectNotFound:
continue
if author != 'GitHub Merge Button <merge-button@github.com>':
yield c
def __str__(self):
return super(File, self).__str__().rstrip("\n\r")
class Author(_Base):
"""
Authors are initialized with a combination of name and email, as they
appear in git configuration.
>>> Author('John Doe <john.doe@aol.com>') # doctest: +SKIP
At this point we don't have a relation to map all aliases of the same
author, so keep in mind this object represents an alias, not a person.
"""
type = 'author'
_keys_registry_dtype = 'author_commits'
def __init__(self, full_email):
self.full_email = full_email
super(Author, self).__init__(full_email)
@cached_property
def commit_shas(self):
""" SHA1 of all commits authored by the Author
>>> commits = Author('user2589 <valiev.m@gmail.com>').commit_shas
>>> len(commits) > 50
True
>>> isinstance(commits, tuple)
True
>>> isinstance(commits[0], str)
True
>>> len(commits[0]) == 40
True
"""
return slice20(self.read_tch('author_commits', silent=True))
@property
def commits(self):
""" A generator of all Commit objects authored by the Author
>>> commits = tuple(Author('user2589 <valiev.m@gmail.com>').commits)
>>> len(commits) > 50
True
>>> isinstance(commits[0], Commit)
True
"""
return (Commit(sha) for sha in self.commit_shas)
@cached_property
def files(self):
        data = decomp(self.read_tch('author_files'))
        return tuple((data and data.split(";")) or [])
@cached_property
def project_names(self):
""" URIs of projects where author has committed to
A generator of all Commit objects authored by the Author
"""
data = decomp(self.read_tch('author_projects'))
return tuple(project_name
for project_name in (data and data.split(";")) or [] if project_name and project_name != 'EMPTY')
@cached_property
def torvald(self):
        data = decomp(self.read_tch('author_trpath'))
        return tuple((data and data.split(";")) or [])
class Clickhouse_DB(object):
''' Clickhouse_DB class represents an instance of the clickhouse client
It is initialized with a table name and a host name for the database
'''
def __init__(self, tb_name, db_host):
self.tb_name = tb_name
self.db_host = db_host
self.client_settings = {'strings_as_bytes':True, 'max_block_size':100000}
self.client = clickhouse.Client(host=self.db_host, settings=self.client_settings)
def query(self, query_str):
return self.client.execute(query_str)
def query_iter(self, query_str):
row_iter = self.client.execute_iter(query_str)
for row in row_iter:
yield row
def query_select(self, s_col, s_from, s_start, s_end):
# normal query
s_where = self.__where_condition(s_start, s_end)
query_str = 'select {} from {} where {}'.format(s_col, s_from, s_where)
return self.client.execute(query_str)
def query_select_iter(self, s_col, s_from, s_start, s_end):
# iterative query
s_where = self.__where_condition(s_start, s_end)
query_str = 'select {} from {} where {}'.format(s_col, s_from, s_where)
row_iter = self.client.execute_iter(query_str)
for row in row_iter:
yield row
def __where_condition(self, start, end):
        # check whether start/end are timestamps or dates, then build the where clause
dt = 'time'
if not self.__check_time(start, end):
dt = 'date'
start = 'toDate(\'{}\')'.format(start)
end = 'toDate(\'{}\')'.format(end) if end else None
if end is None:
return '{}={}'.format(dt, start)
else:
return '{}>={} AND {}<={}'.format(dt, start, dt, end)
    def __check_time(self, start, end):
        # start and end must be of the same type, either int or string
        if start is None:
            raise ValueError('start time cannot be None')
        elif not isinstance(start, (int, basestring)):
            raise ValueError('start time must be either int or string')
        elif end is not None and not isinstance(end, (int, basestring)):
            raise ValueError('end time must be either int or string')
        elif end is not None and type(start) is not type(end):
            raise ValueError('start and end must be of the same type')
        return isinstance(start, int)
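# A sketch of the where-clauses built above (illustrative): int arguments are
# treated as unix timestamps and compared against the `time` column, string
# arguments as dates compared against a `date` column (assumed to exist in
# the queried table):
#
#     # __where_condition(1568656268, None)
#     #     -> "time=1568656268"
#     # __where_condition('2019-09-16', '2019-09-17')
#     #     -> "date>=toDate('2019-09-16') AND date<=toDate('2019-09-17')"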
class Time_commit_info(Clickhouse_DB):
    ''' Time_commit_info class is initialized with a table name and a database host name;
    the default table for commits is commits_all, and the default host is localhost.
    No connection is established until the first query is made.
The 'commits_all' table description is the following:
|__name___|______type_______|
| sha1 | FixedString(20) |
| time | Int32 |
| tree | FixedString(20) |
| author | String |
| parent | String |
| comment | String |
| content | String |
'''
columns = ['sha1', 'time', 'tree', 'author', 'parent', 'comment', 'content']
def __init__(self, tb_name='commits_all', db_host='localhost'):
super(Time_commit_info, self).__init__(tb_name, db_host)
def commit_counts(self, start, end=None):
        ''' return the number of commits at the given time, or within the given range
>>> t = Time_commit_info()
>>> t.commit_counts(1568656268)
8
'''
rows = self.query_select('count(*)', self.tb_name, start, end)
return rows[0][0]
def commits_iter(self, start, end=None):
        ''' return a generator of Commit instances within the given time/date range
>>> t = Time_commit_info()
>>> commits = t.commits_iter(1568656268)
>>> c = commits.next()
>>> type(c)
<class 'oscar.Commit'>
>>> c.parent_shas
('9c4cc4f6f8040ed98388c7dedeb683469f7210f5',)
'''
row_iter = self.query_select_iter('lower(hex(sha1))', self.tb_name, start, end)
for row in row_iter:
yield Commit(row[0])
def commits_shas(self, start, end=None):
        ''' return a list of commit shas within the given time/date range
>>> t = Time_commit_info()
>>> shas = t.commits_shas(1568656268)
>>> type(shas)
<type 'list'>
'''
rows = self.query_select('lower(hex(sha1))', self.tb_name, start, end)
return [row[0] for row in rows]
def commits_shas_iter(self, start, end=None):
        ''' return a generator of commit shas within the given time/date range
>>> t = Time_commit_info()
>>> for sha1 in t.commits_shas_iter(1568656268):
... print(sha1)
'''
row_iter = self.query_select_iter('lower(hex(sha1))', self.tb_name, start, end)
for row in row_iter:
yield row[0]
class Time_project_info(Clickhouse_DB):
    ''' Time_project_info class is initialized with a table name and a database host name.
    The default table name is b2cPtaPkgR_all, and the default host is localhost.
    This class contains methods to query for project data.
    The 'b2cPtaPkgR_all' table description is the following:
|___name___|______type_______|
| blob | FixedString(20) |
| commit | FixedString(20) |
| project | String |
| time | UInt32 |
| author | String |
| language | String |
| deps | String |
'''
columns = ['blob', 'commit', 'project', 'time', 'author', 'language', 'deps']
def __init__(self, tb_name='b2cPtaPkgR_all', db_host='localhost'):
super(Time_project_info, self).__init__(tb_name, db_host)
def get_values_iter(self, cols, start, end):
''' return a generator for table rows for a given time interval
>>> from oscar import Time_project_info as Proj
>>> p = Proj()
>>> rows = p.get_values_iter(['time','project'], 1568571909, 1568571910)
>>> for row in rows:
... print(row)
...
(1568571909, 'mrtrevanderson_CECS_424')
(1568571909, 'gitlab.com_surajpatel_tic_toc_toe')
(1568571909, 'gitlab.com_surajpatel_tic_toc_toe')
...
'''
cols = self.__wrap_cols(cols)
rows_iter = self.query_select_iter(', '.join(cols), self.tb_name, start, end)
for row in rows_iter:
yield row
def project_timeline(self, cols, project):
''' return a generator for all rows given a project name (ordered by time)
>>> rows = p.project_timeline(['time','project'], 'mrtrevanderson_CECS_424')
>>> for row in rows:
... print(row)
...
(1568571909, 'mrtrevanderson_CECS_424')
(1568571909, 'mrtrevanderson_CECS_424')
(1568571909, 'mrtrevanderson_CECS_424')
...
'''
cols = self.__wrap_cols(cols)
query_str = 'SELECT {} FROM {} WHERE project=\'{}\' ORDER BY time'\
.format(', '.join(cols), self.tb_name, project)
rows_iter = self.query_iter(query_str)
for row in rows_iter:
yield row
def author_timeline(self, cols, author):
''' return a generator for all rows given an author (ordered by time)
>>> rows = p.author_timeline(['time', 'project'], 'Andrew Gacek <andrew.gacek@gmail.com>')
>>> for row in rows:
... print(row)
...
(49, 'smaccm_camera_demo')
(677, 'smaccm_vm_hack')
(1180017188, 'teyjus_teyjus')
...
'''
cols = self.__wrap_cols(cols)
query_str = 'SELECT {} FROM {} WHERE author=\'{}\' ORDER BY time'\
.format(', '.join(cols), self.tb_name, author)
rows_iter = self.query_iter(query_str)
for row in rows_iter:
yield row
    def __wrap_cols(self, cols):
        ''' wrap binary sha columns ('commit', 'blob') in lower(hex(...))
        so they come back as hex strings
        '''
for i in range(len(cols)):
if cols[i] == 'commit' or cols[i] == 'blob':
cols[i] = 'lower(hex({}))'.format(cols[i])
return cols