woc.detect
```python
#!/usr/bin/env python3

# SPDX-License-Identifier: GPL-3.0-or-later
# @authors: Runzhi He <rzhe@pku.edu.cn>
# @date: 2024-01-17

import argparse
import json
import logging
import os
import re
from functools import cmp_to_key
from typing import Iterable, Optional, Tuple

from tqdm import tqdm

from .utils import sample_md5

_default_profile = os.path.join(os.path.dirname(__file__), "wocprofile.default.json")
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)

MAP_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch$"
"""Filename regex for basemap files"""
_map_pat = re.compile(MAP_REGEX)


def parse_map_fname(fname: str):
    """
    Parse basemap filename into (src, dst, ver, idx).

    >>> parse_map_fname('c2fFullR.3.tch')
    ('c', 'f', 'R', '3')
    >>> parse_map_fname('c2fFullR.tch')
    ('c', 'f', 'R', None)
    """
    m = _map_pat.match(fname)
    if not m or len(m.groups()) != 4:
        return None
    return m.groups()


LARGE_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch.large.([0-9a-f]+)$"
"""Filename regex for large basemap files"""
_large_pat = re.compile(LARGE_REGEX)


def parse_large_fname(fname: str):
    """
    Parse basemap filename into (src, dst, ver, idx, hash).

    >>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
    ('A', 'c', 'U', '15', '59016a4f')
    """
    m = _large_pat.match(fname)
    if not m or len(m.groups()) != 5:
        return None
    return m.groups()


OBJ_REGEX = r"^([\w\.]+)_(\d+).(idx|bin|tch)$"
"""Filename regex for object files"""
_obj_pat = re.compile(OBJ_REGEX)


def parse_obj_fname(fname: str):
    """
    Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).

    >>> parse_obj_fname('commit_0.tch')
    ('commit', '0', 'tch')
    >>> parse_obj_fname('blob_0.idx')
    ('blob', '0', 'idx')
    >>> parse_obj_fname('sha1.blob_0.bin')
    ('sha1.blob', '0', 'bin')
    """
    m = _obj_pat.match(fname)
    if not m or len(m.groups()) != 3:
        return None
    return m.groups()


def compare_woc_version(ver1: str, ver2: str):
    """
    Compare two woc version strings (A < Z < AA).

    >>> compare_woc_version('S', 'T') > 0
    False
    >>> compare_woc_version('AA', 'U') > 0
    True
    """
    if len(ver1) != len(ver2):
        return len(ver1) - len(ver2)
    return ord(ver1[0]) - ord(ver2[0])


def infer_dtype(map_name: str) -> Tuple[str, str]:
    """
    Infer the data types from the map's name (entity -> entity).

    Should be bug-to-bug compatible with:
    https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34
    >>> infer_dtype('c2f')
    ('h', 'cs')
    >>> infer_dtype('b2tac')
    ('h', 'cs3')
    """
    ent_all = map_name.lower()
    ent_in, ent_out = ent_all.split("2")

    dtype_in, dtype_out = "h", "h"

    if ent_in in ("a", "f", "p"):
        dtype_in = "s"
    if ent_out in ("a", "f", "p"):
        dtype_out = "cs"
    if ent_in in ("c", "b", "w", "ob", "td"):
        dtype_in = "h"
    if ent_out in ("c", "b", "cc", "pc", "ob", "td"):
        dtype_out = "h"
    if ent_all in ("b2fa", "c2tag"):
        dtype_out = "sh"
    if ent_out in ("ta",):
        dtype_out = "s"
    if ent_all in ("b2tk", "td2f"):
        dtype_out = "s"
    if ent_all in ("c2h", "c2r"):
        dtype_out = "r"
    if ent_in in ("ps", "pf", "pfs"):
        dtype_in = "s"
    if ent_out in ("ps", "pf", "pfs"):
        dtype_out = "s"
    if ent_out in ("rhp",):
        dtype_out = "hhwww"
    if ent_all in ("p2p", "a2a"):
        dtype_in, dtype_out = "s", "cs"
    if ent_all in ("b2baddate", "b2manyp"):
        dtype_in, dtype_out = "s", "h"
    if ent_all in ("c2fbb", "obb2cf", "bb2cf"):
        dtype_in, dtype_out = "h", "cs3"
    if ent_all in ("c2dat",):
        dtype_in, dtype_out = "h", "s"
    if ent_all in ("b2tac",):
        dtype_in, dtype_out = "h", "cs3"

    return dtype_in, dtype_out


def detect_profile(
    paths: Iterable[str],
    version: Optional[str] = None,
    preset_path: Optional[str] = None,
    check_missing: bool = True,
    with_digest: bool = False,
):
    _maps, _objs = {}, {}

    if not preset_path:
        preset_path = _default_profile

    def _handle_map(src, dst, ver, idx, hash):
        if version and ver != version:
            logging.info(f"Found map {f} with version {ver}, expected {version}")
            return

        _map_name = f"{src}2{dst}"
        if idx is None:
            idx = "0"
        prefix_len = int(idx).bit_length()

        _map = _maps.setdefault(_map_name, {}).setdefault(
            ver,
            {
                "version": ver,
                "sharding_bits": prefix_len,
                "shards": {},
                "larges": {},
                "dtypes": infer_dtype(_map_name),
            },
        )
        if not hash:
            logging.debug(f"Found map {f} with hash {hash} idx {idx}")
            _map["shards"][int(idx)] = os.path.join(root, f)
        else:
            logging.debug(f"Found large map {f} with hash {hash} idx {idx}")
            _map["larges"][hash] = os.path.join(root, f)
        _map["sharding_bits"] = max(_map["sharding_bits"], prefix_len)

    def _handle_obj(name, idx, ext):
        # quirk for "All.sha1" folder
        # we don't want sha1.blob_*.tch files
        if root.endswith("All.sha1"):
            return

        _map_name = f"{name}.{ext}"
        prefix_len = int(idx).bit_length() if idx else 0
        _obj = _objs.setdefault(
            _map_name,
            {
                "sharding_bits": prefix_len,
                "shards": {},
            },
        )
        logging.debug(f"Found obj {f} idx {idx}")
        _obj["shards"][int(idx)] = os.path.join(root, f)
        _obj["sharding_bits"] = max(_obj["sharding_bits"], prefix_len)

    for path in paths:
        # walk the directory for all files
        for root, _, files in os.walk(path):
            # only consider .tch, .idx, .bin files
            files = [
                f
                for f in files
                if ".tch" in f
                or (not f.startswith("pack") and f.endswith(".idx"))
                or f.endswith(".bin")
            ]
            for idx, f in enumerate(tqdm(files, desc=root)):
                _r = parse_map_fname(f)
                if _r:
                    src, dst, ver, idx = _r
                    _handle_map(src, dst, ver, idx, None)
                    continue

                _r = parse_large_fname(f)
                if _r:
                    src, dst, ver, idx, hash = _r
                    _handle_map(src, dst, ver, idx, hash)
                    continue

                _r = parse_obj_fname(f)
                if _r:
                    name, idx, ext = _r
                    _handle_obj(name, idx, ext)
                    continue
                _logger.warning(f"Unrecognized file: {f}")

    # transform maps
    _ls_maps = {}
    for k, v in _maps.items():
        _to_drop = []
        for ver, vv in v.items():
            # convert shards to list
            _ls = [None] * 2 ** vv["sharding_bits"]
            for kkk, vvv in vv["shards"].items():
                _ls[kkk] = vvv
            # see if we can find the None in _ls
            _nones = [i for i, x in enumerate(_ls) if x is None]
            if _nones and check_missing:
                _logger.warning(
                    f'Cannot find shards {", ".join(map(str, _nones))} in map {k} ver {ver}, skipping'
                )
                _logger.warning(f"Got: {vv['shards']}")
                _to_drop.append(ver)
            else:
                vv["shards"] = _ls
        for ver in _to_drop:
            del v[ver]

        # move latest maps to the front of the list
        if len(v) == 0:
            continue
        _ls_maps[k] = [
            vv
            for _, vv in sorted(
                v.items(),
                key=cmp_to_key(lambda x, y: compare_woc_version(x[0], y[0])),
                reverse=True,
            )
        ]

    # transform objects
    _ls_objs = {}
    for k, v in _objs.items():
        # convert shards to list
        _ls = [None] * 2 ** v["sharding_bits"]
        for kk, vv in v["shards"].items():
            _ls[kk] = vv
        # see if we can find the None in _ls
        _nones = [i for i, x in enumerate(_ls) if x is None]
        if _nones and check_missing:
            _logger.warning(
                f'Cannot find shards {", ".join(map(str, _nones))} in obj {k}, skipping'
            )
            _logger.warning(f"Got: {v['shards']}")
        else:
            v["shards"] = _ls
            _ls_objs[k] = v

    # quirk for da* servers
    # follow symlinks to avoid the overhead of NFS
    def _resolve_path(file_path: str) -> str:
        _resolved = os.path.realpath(file_path)
        if file_path != _resolved and re.match(r"^/da[0-9]+", _resolved):
            _logger.warning(f"Resolve {file_path} to {_resolved}")
            return _resolved
        return file_path

    # transform to v2
    _total_files = 0
    for l_maps in _ls_maps.values():
        for _map in l_maps:
            # iterate over shards
            _new_shards = []
            for shard_path in _map["shards"]:
                if shard_path is None:
                    continue
                _new_shards.append(
                    {
                        "path": _resolve_path(shard_path),
                        "size": os.path.getsize(shard_path),
                        "digest": None,
                    }
                )
            _map["shards"] = _new_shards
            _total_files += len(_new_shards)

            # iterate over larges
            _new_larges = {}
            for hash, large_path in _map["larges"].items():
                _new_larges[hash] = {
                    "path": _resolve_path(large_path),
                    "size": os.path.getsize(large_path),
                    "digest": None,
                }
            _map["larges"] = _new_larges
            _total_files += len(_new_larges)

    for obj_name, obj in _ls_objs.items():
        _new_shards = []
        for shard_path in obj["shards"]:
            if shard_path is None:
                continue
            _new_shards.append(
                {
                    "path": _resolve_path(shard_path),
                    "size": os.path.getsize(shard_path),
                    "digest": None,
                }
            )
        obj["shards"] = _new_shards
        _total_files += len(_new_shards)

        # aliases
        if obj_name == "tree.tch":
            obj["alias"] = "tree"
        elif obj_name == "commit.tch":
            obj["alias"] = "commit"
        elif obj_name == "sha1.blob.tch":
            obj["alias"] = "blob"

    if with_digest:
        with tqdm(total=_total_files, desc="Calculating digests") as pbar:
            for l_maps in _ls_maps.values():
                for _map in l_maps:
                    for shard in _map["shards"]:
                        if shard is None:
                            continue
                        shard["digest"] = sample_md5(shard["path"])
                        pbar.update(1)
                    for large in _map["larges"].values():
                        large["digest"] = sample_md5(large["path"])
                        pbar.update(1)

            for obj in _ls_objs.values():
                for shard in obj["shards"]:
                    if shard is None:
                        continue
                    shard["digest"] = sample_md5(shard["path"])
                    pbar.update(1)

    # load the preset profile
    with open(preset_path, "r") as f:
        res = json.load(f)

    res["maps"] = _ls_maps
    res["objects"] = _ls_objs
    return res


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    parser = argparse.ArgumentParser(description="Detect woc profile")
    parser.add_argument(
        "paths", metavar="PATH", type=str, nargs="+", help="path to woc directory"
    )
    parser.add_argument("--version", type=str, default=None, help="woc mapping version")
    parser.add_argument(
        "--preset", type=str, default=_default_profile, help="path to preset profile"
    )
    parser.add_argument("--output", type=str, default=None, help="path to output profile")
    parser.add_argument(
        "--no-skip-missing",
        dest="check_missing",
        action="store_false",
        help="do not check missing shards",
    )
    parser.add_argument(
        "--with-digest",
        dest="with_digest",
        action="store_true",
        help="calculate digest for each file",
        default=False,
    )

    args = parser.parse_args()

    res = detect_profile(
        args.paths, args.version, args.preset, args.check_missing, args.with_digest
    )
    if args.output:
        with open(args.output, "w") as f:
            json.dump(res, f, indent=2)
    else:
        print(json.dumps(res, indent=2))
```
MAP_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch$"

Filename regex for basemap files
def parse_map_fname(fname: str):
Parse basemap filename into (src, dst, ver, idx).
>>> parse_map_fname('c2fFullR.3.tch')
('c', 'f', 'R', '3')
>>> parse_map_fname('c2fFullR.tch')
('c', 'f', 'R', None)
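Note that the dots in MAP_REGEX are unescaped, so each `.` matches any character and the parser is slightly more permissive than the canonical filenames suggest. A small sketch of this quirk, using a made-up filename (the behavior is traced from the pattern above, not an officially supported input):

```python
from woc.detect import parse_map_fname

# 'Ratch' carries no literal '.tch' extension, yet the unescaped dot in
# the pattern happily matches the 'a', so the filename still parses:
assert parse_map_fname('c2fFullRatch') == ('c', 'f', 'R', None)
```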
LARGE_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch.large.([0-9a-f]+)$"

Filename regex for large basemap files
def parse_large_fname(fname: str):
Parse basemap filename into (src, dst, ver, idx, hash).
>>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
('A', 'c', 'U', '15', '59016a4f')
OBJ_REGEX = r"^([\w\.]+)_(\d+).(idx|bin|tch)$"

Filename regex for object files
def parse_obj_fname(fname: str):
Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).
>>> parse_obj_fname('commit_0.tch')
('commit', '0', 'tch')
>>> parse_obj_fname('blob_0.idx')
('blob', '0', 'idx')
>>> parse_obj_fname('sha1.blob_0.bin')
('sha1.blob', '0', 'bin')
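The captured `idx` is what drives shard counting later: `_handle_obj` computes `int(idx).bit_length()` and keeps the maximum across a shard family, so the sharding bits follow from the highest index seen. A short trace of that arithmetic:

```python
from woc.detect import parse_obj_fname

name, idx, ext = parse_obj_fname('blob_127.idx')
assert (name, idx, ext) == ('blob', '127', 'idx')

# 127 needs 7 bits, implying a family of 2**7 = 128 shards
# (blob_0.idx .. blob_127.idx), which is what detect_profile infers.
assert int(idx).bit_length() == 7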
def compare_woc_version(ver1: str, ver2: str):
Compare two woc version strings (A < Z < AA).
>>> compare_woc_version('S', 'T') > 0
False
>>> compare_woc_version('AA', 'U') > 0
True
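Because this is a three-way comparator rather than a key function, `detect_profile` wraps it with `functools.cmp_to_key` and sorts in reverse so the newest version comes first; note that when lengths are equal, only the first character is compared. The same pattern works standalone:

```python
from functools import cmp_to_key
from woc.detect import compare_woc_version

versions = ['R', 'AA', 'U', 'T']
newest_first = sorted(versions, key=cmp_to_key(compare_woc_version), reverse=True)
assert newest_first == ['AA', 'U', 'T', 'R']  # longer version strings sort as newer
```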
def infer_dtype(map_name: str) -> Tuple[str, str]:
Infer the data types from the map's name (entity -> entity).
Should be bug-to-bug compatible with: https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34
>>> infer_dtype('c2f')
('h', 'cs')
>>> infer_dtype('b2tac')
('h', 'cs3')
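A few more inferences, traced through the branches above (each result follows directly from the code, not from an external spec):

```python
from woc.detect import infer_dtype

assert infer_dtype('p2a') == ('s', 'cs')   # ent_in 'p' -> 's'; ent_out 'a' -> 'cs'
assert infer_dtype('c2r') == ('h', 'r')    # ent_all special case shared with 'c2h'
assert infer_dtype('b2fa') == ('h', 'sh')  # ent_all special case shared with 'c2tag'
assert infer_dtype('c2dat') == ('h', 's')  # ent_all special case of its own
```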
def detect_profile(paths: Iterable[str], version: Optional[str] = None, preset_path: Optional[str] = None, check_missing: bool = True, with_digest: bool = False):
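Scans the given paths for basemap and object files, validates shard completeness, and layers the result over the preset profile. The `__main__` block in the source above is a thin CLI wrapper around this function (`python3 -m woc.detect PATH... --output wocprofile.json`). A minimal programmatic sketch, with hypothetical input directories:

```python
import json
from woc.detect import detect_profile

# Hypothetical paths; point these at directories holding .tch/.idx/.bin files.
profile = detect_profile(
    ["/data/basemaps", "/data/objects"],
    version="V",        # optional: only accept maps of this woc version
    with_digest=False,  # True additionally samples an MD5 digest per file
)

with open("wocprofile.json", "w") as fp:
    json.dump(profile, fp, indent=2)
```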