woc.local

def fnvhash(data: bytes) -> int:

Returns the 32 bit FNV-1a hash value for the given data.

>>> hex(fnvhash(b'foo'))
'0xa9f37ed7'
def unber(buf: bytes) -> bytes:

Perl BER unpacking. BER is a way to pack several variable-length ints into one binary string. Here we do the reverse. Format definition: from http://perldoc.perl.org/functions/pack.html (see "w" template description)

Parameters
  • buf: a binary string with packed values
Returns

a list of unpacked values

>>> unber(b'\x00\x83M')
[0, 461]
>>> unber(b'\x83M\x96\x14')
[461, 2836]
>>> unber(b'\x99a\x89\x12')
[3297, 1170]
def lzf_length(raw_data: bytes) -> Tuple[int, int]:

Get length of uncompressed data from a header of Compress::LZF output.

Check Compress::LZF sources for the definition of this bit magic: (namely, LZF.xs, decompress_sv) https://metacpan.org/source/MLEHMANN/Compress-LZF-3.8/LZF.xs

Parameters
  • raw_data: data compressed with Perl Compress::LZF
Returns

(header_size, uncompressed_content_length) in bytes

>>> lzf_length(b'\xc4\x9b')
(2, 283)
>>> lzf_length(b'\xc3\xa4')
(2, 228)
>>> lzf_length(b'\xc3\x8a')
(2, 202)
>>> lzf_length(b'\xca\x87')
(2, 647)
>>> lzf_length(b'\xe1\xaf\xa9')
(3, 7145)
>>> lzf_length(b'\xe0\xa7\x9c')
(3, 2524)
def decomp(data: bytes) -> bytes:

lzf wrapper to handle perl tweaks in Compress::LZF

This function extracts uncompressed size header and then does usual lzf decompression.

Parameters
  • raw_data: data compressed with Perl Compress::LZF
Returns

unpacked data

def decomp_or_raw(data: bytes) -> bytes:

Try to decompress raw_data, return raw_data if it fails

def slice20(raw_data):

Slice raw_data into 20-byte chunks and hex-encode each of them. It returns a tuple so that the result is cacheable.

def decode_str(raw_data: str, encoding='utf-8'):

Decode raw_data, detect the encoding if utf-8 fails

def get_tch(path: str):

Cache TCHashDB objects

def get_shard(key: bytes, sharding_bits: int, use_fnv_keys: bool) -> int:

Get shard id

def decode_value(value: bytes, out_dtype: str):

Decode values from tch maps.

def decode_tree(value: bytes) -> List[Tuple[str, str, str]]:

Decode a tree binary object into tuples.

Python: 4.77 µs, Cython: 280 ns. Reference: https://stackoverflow.com/questions/14790681/

>>> decode_tree(b'100644 .gitignore\x00\x8e\x9e\x1f...')
[('100644', '.gitignore', '8e9e1...'), ...]
def decode_commit( commit_bin: bytes) -> Tuple[str, Tuple[str, str, str], Tuple[str, str, str], str]:

Decode git commit objects into tuples.

Python: 2.35 µs, Cython: 855 ns. Reference: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects

>>> decode_commit(b'tree f1b66dcca490b5c4455af319bc961a34f69c72c2\n...')
('f1b66dcca490b5c4455af319bc961a34f69c72c2',
 ('c19ff598808b181f1ab2383ff0214520cb3ec659',),
 ('Audris Mockus <audris@utk.edu> 1410029988', '1410029988', '-0400'),
 ('Audris Mockus <audris@utk.edu>', '1410029988', '-0400'),
 'News for Sep 5, 2014\n')
def decode_tag(tag: bytes):

Decode git tag objects into tuples.

>>> decode_tag(b'object fcadcb9366d4a011039e384affa10961e99cf2c4\ntype commit\ntag eccube-2.11.1\ntagger nanasess 1303788649 +0000\n\nAdded tags/eccube-2.11.1\n')
('fcadcb9366d4a011039e384affa10961e99cf2c4', 'commit', 'eccube-2.11.1', 'nanasess ', '1303788649', '+0000')

def read_large_random_access( path: str, dtype: str, offset: int = 0, length: int = 131072) -> Tuple[bytes, Union[int, NoneType]]:

Read a .large.* file and return its content.

Parameters
  • path: path to the file
  • dtype: data type
  • offset: offset to start reading. It is either 0 or after the last separator.
  • length: length to read. It should be longer than the longest record.
Returns

a tuple of bytes and the next offset, None if EOF. Returned bytes must not begin or end with a separator.

class WocMapsLocal(woc.base.WocMapsBase):
WocMapsLocal( profile_path: Union[str, Iterable[str], NoneType] = None, version: Union[str, Iterable[str], NoneType] = None, on_large: Literal['ignore', 'head', 'all'] = 'all')

Initialize local WoC maps with a profile.

Parameters
  • profile_path: path to the woc profile. if not provided, use ./wocprofile.json, ~/.wocprofile.json, /home/wocprofile.json, /etc/wocprofile.json.
  • version: version of the profile, default to the latest version. can be a single version like 'R' or a list of versions like ['R', 'U'].
  • on_large: how to handle large files, default to 'all' (read all content). 'ignore' to ignore large files, 'head' to read only the first chunk.
def get_values( self, map_name: str, key: Union[bytes, str]) -> Union[List[str], Tuple[str, str, str], List[Tuple[str, str, str]]]:

Equivalent to getValues in WoC Perl API.

Parameters
  • map_name: The name of the map, e.g. 'c2p', 'c2r', 'P2c'
  • key: The key of the object. For git objects, it is the SHA-1 hash of the object (in bytes or hex string). For other objects like Author, it is the name of the object.
Returns

The value of the object. Can be a list of strings, a tuple of strings, or a list of tuples of strings. Please refer to the documentation for details.

>>> self.get_values('P2c', 'user2589_minicms')
['05cf84081b63cda822ee407e688269b494a642de', ...]
>>> self.get_values('c2r', 'e4af89166a17785c1d741b8b1d5775f3223f510f')
('9531fc286ef1f4753ca4be9a3bf76274b929cdeb', 27)
>>> self.get_values('b2fa', '05fe634ca4c8386349ac519f899145c75fff4169')
('1410029988',
 'Audris Mockus <audris@utk.edu>',
 'e4af89166a17785c1d741b8b1d5775f3223f510f')
def iter_values( self, map_name: str, key: Union[bytes, str]) -> Generator[Union[List[str], Tuple[str, str, str], List[Tuple[str, str, str]]], NoneType, NoneType]:

Similar to get_values, but returns a generator instead of a list. This is useful when querying large maps (on_large='all').

Parameters
  • map_name: The name of the map, e.g. 'c2p', 'c2r', 'P2c'
  • key: The key of the object. For git objects, it is the SHA-1 hash of the object (in bytes or hex string). For other objects like Author, it is the name of the object.
Returns

The value of the object. Can be a list of strings, a tuple of strings, or a list of tuples of strings. Please refer to the documentation for details.

>>> list(self.iter_values('P2c', 'user2589_minicms'))
['05cf84081b63cda822ee407e688269b494a642de', ...]
def show_content( self, obj_name: Literal['tree', 'blob', 'commit', 'tkns', 'tag', 'bdiff'], key: Union[bytes, str]) -> Union[List[Tuple[str, str, str]], str, Tuple[str, Tuple[str, str, str], Tuple[str, str, str], str]]:

Equivalent to showCnt in WoC Perl API.

Parameters
  • obj_name: The name of the object, e.g. 'blob', 'tree', 'commit'
  • key: The key of the object. It is the SHA-1 hash of the object (in bytes or hex string).
Returns

The content of the object. Can be a list of tuples of strings, a string, or a tuple of strings.

>>> self.show_content('blob', '05fe634ca4c8386349ac519f899145c75fff4169')
'This is the content of the blob'
Equivalent to showCnt in WoC Perl API
>>> self.show_content('tree', '7a374e58c5b9dec5f7508391246c48b73c40d200')
[('100644', '.gitignore', '8e9e1...'), ...]
>>> self.show_content('commit', 'e4af89166a17785c1d741b8b1d5775f3223f510f')
('f1b66dcca490b5c4455af319bc961a34f69c72c2',
 ('c19ff598808b181f1ab2383ff0214520cb3ec659',),
 ('Audris Mockus <audris@utk.edu> 1410029988', '1410029988', '-0400'),
 ('Audris Mockus <audris@utk.edu>', '1410029988', '-0400'),
 'News for Sep 5, 2014\n')
def count(self, map_name: str) -> int:

Count the number of keys in a map.

Parameters
  • map_name: The name of the mapping / object, e.g. 'c2p', 'c2r', 'commit'.
Returns

The number of keys in the tch databases plus the number of large files.

>>> self.count('c2r')
12345
def all_keys(self, map_name: str) -> Generator[bytes, NoneType, NoneType]:

Iterate over all keys in a map.

Parameters
  • map_name: The name of the mapping / object, e.g. 'c2p', 'c2r', 'commit'. When on_large is 'ignore', keys in large maps are excluded.
Returns

A generator of keys in the map.

>>> for key in self.all_keys('P2c'):
...     print(key)  # hash or encoded string
Inherited Members
woc.base.WocMapsBase
maps
objects