woc.detect
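Scans World of Code (WoC) data directories for basemap and object files and assembles them into a `wocprofile` JSON document. The full module source follows; the per-symbol reference is below.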
```python
#!/usr/bin/env python3

# SPDX-License-Identifier: GPL-3.0-or-later
# @authors: Runzhi He <rzhe@pku.edu.cn>
# @date: 2024-01-17

import argparse
import json
import logging
import os
import re
from functools import cmp_to_key
from typing import Iterable, Optional, Tuple

from tqdm import tqdm

from .utils import sample_md5

_default_profile = os.path.join(os.path.dirname(__file__), "wocprofile.default.json")
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)

MAP_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch$"
"""Filename regex for basemap files"""
_map_pat = re.compile(MAP_REGEX)


def parse_map_fname(fname: str):
    """
    Parse basemap filename into (src, dst, ver, idx).

    >>> parse_map_fname('c2fFullR.3.tch')
    ('c', 'f', 'R', '3')
    >>> parse_map_fname('c2fFullR.tch')
    ('c', 'f', 'R', None)
    """
    m = _map_pat.match(fname)
    if not m or len(m.groups()) != 4:
        return None
    return m.groups()


LARGE_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch.large.([0-9a-f]+)$"
"""Filename regex for large basemap files"""
_large_pat = re.compile(LARGE_REGEX)


def parse_large_fname(fname: str):
    """
    Parse basemap filename into (src, dst, ver, idx, hash).

    >>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
    ('A', 'c', 'U', '15', '59016a4f')
    """
    m = _large_pat.match(fname)
    if not m or len(m.groups()) != 5:
        return None
    return m.groups()


OBJ_REGEX = r"^([\w\.]+)_(\d+).(idx|bin|tch)$"
"""Filename regex for object files"""
_obj_pat = re.compile(OBJ_REGEX)


def parse_obj_fname(fname: str):
    """
    Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).

    >>> parse_obj_fname('commit_0.tch')
    ('commit', '0', 'tch')
    >>> parse_obj_fname('blob_0.idx')
    ('blob', '0', 'idx')
    >>> parse_obj_fname('sha1.blob_0.bin')
    ('sha1.blob', '0', 'bin')
    """
    m = _obj_pat.match(fname)
    if not m or len(m.groups()) != 3:
        return None
    return m.groups()


def compare_woc_version(ver1: str, ver2: str):
    """
    Compare two woc version strings (A < Z < AA).

    >>> compare_woc_version('S', 'T') > 0
    False
    >>> compare_woc_version('AA', 'U') > 0
    True
    """
    if len(ver1) != len(ver2):
        return len(ver1) - len(ver2)
    return ord(ver1[0]) - ord(ver2[0])


def infer_dtype(map_name: str) -> Tuple[str, str]:
    """
    Infer the data types from the map's name (entity -> entity).

    Should be bug-to-bug compatible with:
    https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34
    >>> infer_dtype('c2f')
    ('h', 'cs')
    >>> infer_dtype('b2tac')
    ('h', 'cs3')
    """
    ent_all = map_name.lower()
    ent_in, ent_out = ent_all.split("2")

    dtype_in, dtype_out = "h", "h"

    if ent_in in ("a", "f", "p"):
        dtype_in = "s"
    if ent_out in ("a", "f", "p"):
        dtype_out = "cs"
    if ent_in in ("c", "b", "w", "ob", "td"):
        dtype_in = "h"
    if ent_out in ("c", "b", "cc", "pc", "ob", "td"):
        dtype_out = "h"
    if ent_all in ("b2fa"):
        dtype_out = "cs3"
    if ent_out in ("ta", "tag"):
        dtype_out = "s"
    if ent_all in (
        "b2tk",
        "td2f",
    ):
        dtype_out = "s"
    if ent_all in ("c2h", "c2r"):
        dtype_out = "r"
    if ent_in in ("ps", "pf", "pfs"):
        dtype_in = "s"
    if ent_out in ("ps", "pf", "pfs"):
        dtype_out = "s"
    if ent_out in ("rhp",):
        dtype_out = "hhwww"
    if ent_all in ("p2p", "a2a"):
        dtype_in, dtype_out = "s", "cs"
    if ent_all in ("b2baddate", "b2manyp"):
        dtype_in, dtype_out = "s", "h"
    if ent_all in ("c2fbb", "obb2cf", "bb2cf"):
        dtype_in, dtype_out = "h", "cs3"
    if ent_all in ("c2dat",):
        dtype_in, dtype_out = "h", "s"
    if ent_all in ("b2tac", "b2cff"):
        dtype_in, dtype_out = "h", "cs3"
    if ent_all in ("po2pn",):
        dtype_in, dtype_out = "s", "cs"
    return dtype_in, dtype_out


def detect_profile(
    paths: Iterable[str],
    version: Optional[str] = None,
    preset_path: Optional[str] = None,
    check_missing: bool = True,
    with_digest: bool = False,
):
    _maps, _objs = {}, {}

    if not preset_path:
        preset_path = _default_profile

    def _handle_map(src, dst, ver, idx, hash):
        if version and ver != version:
            logging.info(f"Found map {f} with version {ver}, expected {version}")
            return

        _map_name = f"{src}2{dst}"
        if idx is None:
            idx = "0"
        prefix_len = int(idx).bit_length()

        _map = _maps.setdefault(_map_name, {}).setdefault(
            ver,
            {
                "version": ver,
                "sharding_bits": prefix_len,
                "shards": {},
                "larges": {},
                "dtypes": infer_dtype(_map_name),
            },
        )
        if not hash:
            logging.debug(f"Found map {f} with hash {hash} idx {idx}")
            _map["shards"][int(idx)] = os.path.join(root, f)
        else:
            logging.debug(f"Found large map {f} with hash {hash} idx {idx}")
            _map["larges"][hash] = os.path.join(root, f)
        _map["sharding_bits"] = max(_map["sharding_bits"], prefix_len)

    def _handle_obj(name, idx, ext):
        # quirk for "All.sha1" folder
        # we don't want sha1.blob_*.tch files
        if root.endswith("All.sha1"):
            return

        _map_name = f"{name}.{ext}"
        prefix_len = int(idx).bit_length() if idx else 0
        _obj = _objs.setdefault(
            _map_name,
            {
                "sharding_bits": prefix_len,
                "shards": {},
            },
        )
        logging.debug(f"Found obj {f} idx {idx}")
        _obj["shards"][int(idx)] = os.path.join(root, f)
        _obj["sharding_bits"] = max(_obj["sharding_bits"], prefix_len)

    for path in paths:
        # walk the directory for all files
        for root, _, files in os.walk(path):
            # only consider .tch, .idx, .bin files
            files = [
                f
                for f in files
                if ".tch" in f
                or (not f.startswith("pack") and f.endswith(".idx"))
                or f.endswith(".bin")
            ]
            for idx, f in enumerate(tqdm(files, desc=root)):
                _r = parse_map_fname(f)
                if _r:
                    src, dst, ver, idx = _r
                    _handle_map(src, dst, ver, idx, None)
                    continue

                _r = parse_large_fname(f)
                if _r:
                    src, dst, ver, idx, hash = _r
                    _handle_map(src, dst, ver, idx, hash)
                    continue

                _r = parse_obj_fname(f)
                if _r:
                    name, idx, ext = _r
                    _handle_obj(name, idx, ext)
                    continue
                _logger.warning(f"Unrecognized file: {f}")

    # transform maps
    _ls_maps = {}
    for k, v in _maps.items():
        _to_drop = []
        for ver, vv in v.items():
            # convert shards to list
            _ls = [None] * 2 ** vv["sharding_bits"]
            for kkk, vvv in vv["shards"].items():
                _ls[kkk] = vvv
            # see if we can find the None in _ls
            _nones = [i for i, x in enumerate(_ls) if x is None]
            if _nones and check_missing:
                _logger.warning(
                    f'Cannot find shards {", ".join(map(str, _nones))} in map {k} ver {ver}, skipping'
                )
                _logger.warning(f"Got: {vv['shards']}")
                _to_drop.append(ver)
            else:
                vv["shards"] = _ls
        for ver in _to_drop:
            del v[ver]

        # move latest maps to the front of the list
        if len(v) == 0:
            continue
        _ls_maps[k] = [
            vv
            for _, vv in sorted(
                v.items(),
                key=cmp_to_key(lambda x, y: compare_woc_version(x[0], y[0])),
                reverse=True,
            )
        ]

    # transform objects
    _ls_objs = {}
    for k, v in _objs.items():
        # convert shards to list
        _ls = [None] * 2 ** v["sharding_bits"]
        for kk, vv in v["shards"].items():
            _ls[kk] = vv
        # see if we can find the None in _ls
        _nones = [i for i, x in enumerate(_ls) if x is None]
        if _nones and check_missing:
            _logger.warning(
                f'Cannot find shards {", ".join(map(str, _nones))} in obj {k}, skipping'
            )
            _logger.warning(f"Got: {v['shards']}")
        else:
            v["shards"] = _ls
            _ls_objs[k] = v

    # quirk for da* servers
    # follow symlinks to avoid the overhead of NFS
    def _resolve_path(file_path: str) -> str:
        _resolved = os.path.realpath(file_path)
        if file_path != _resolved and re.match(r"^/da[0-9]+", _resolved):
            _logger.warning(f"Resolve {file_path} to {_resolved}")
            return _resolved
        return file_path

    # transform to v2
    _total_files = 0
    for l_maps in _ls_maps.values():
        for _map in l_maps:
            # iterate over shards
            _new_shards = []
            for shard_path in _map["shards"]:
                if shard_path is None:
                    continue
                _new_shards.append(
                    {
                        "path": _resolve_path(shard_path),
                        "size": os.path.getsize(shard_path),
                        "digest": None,
                    }
                )
            _map["shards"] = _new_shards
            _total_files += len(_new_shards)

            # iterate over larges
            _new_larges = {}
            for hash, large_path in _map["larges"].items():
                _new_larges[hash] = {
                    "path": _resolve_path(large_path),
                    "size": os.path.getsize(large_path),
                    "digest": None,
                }
            _map["larges"] = _new_larges
            _total_files += len(_new_larges)

    for obj_name, obj in _ls_objs.items():
        _new_shards = []
        for shard_path in obj["shards"]:
            if shard_path is None:
                continue
            _new_shards.append(
                {
                    "path": _resolve_path(shard_path),
                    "size": os.path.getsize(shard_path),
                    "digest": None,
                }
            )
        obj["shards"] = _new_shards
        _total_files += len(_new_shards)

        # aliases
        if obj_name == "tree.tch":
            obj["alias"] = "tree"
        elif obj_name == "commit.tch":
            obj["alias"] = "commit"
        elif obj_name == "sha1.blob.tch":
            obj["alias"] = "blob"

    if with_digest:
        with tqdm(total=_total_files, desc="Calculating digests") as pbar:
            for l_maps in _ls_maps.values():
                for _map in l_maps:
                    for shard in _map["shards"]:
                        if shard is None:
                            continue
                        shard["digest"] = sample_md5(shard["path"])
                        pbar.update(1)
                    for large in _map["larges"].values():
                        large["digest"] = sample_md5(large["path"])
                        pbar.update(1)

            for obj in _ls_objs.values():
                for shard in obj["shards"]:
                    if shard is None:
                        continue
                    shard["digest"] = sample_md5(shard["path"])
                    pbar.update(1)

    # load the preset profile
    with open(preset_path, "r") as f:
        res = json.load(f)

    res["maps"] = _ls_maps
    res["objects"] = _ls_objs
    return res


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    parser = argparse.ArgumentParser(description="Detect woc profile")
    parser.add_argument(
        "paths", metavar="PATH", type=str, nargs="+", help="path to woc directory"
    )
    parser.add_argument("--version", type=str, default=None, help="woc mapping version")
    parser.add_argument(
        "--preset", type=str, default=_default_profile, help="path to preset profile"
    )
    parser.add_argument("--output", type=str, default=None, help="path to output profile")
    parser.add_argument(
        "--no-skip-missing",
        dest="check_missing",
        action="store_false",
        help="do not check missing shards",
    )
    parser.add_argument(
        "--with-digest",
        dest="with_digest",
        action="store_true",
        help="calculate digest for each file",
        default=False,
    )

    args = parser.parse_args()

    res = detect_profile(
        args.paths, args.version, args.preset, args.check_missing, args.with_digest
    )
    if args.output:
        with open(args.output, "w") as f:
            json.dump(res, f, indent=2)
    else:
        print(json.dumps(res, indent=2))
```
MAP_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch$"
Filename regex for basemap files
def parse_map_fname(fname: str):
Parse basemap filename into (src, dst, ver, idx).
>>> parse_map_fname('c2fFullR.3.tch')
('c', 'f', 'R', '3')
>>> parse_map_fname('c2fFullR.tch')
('c', 'f', 'R', None)
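A quick check of the matching behavior (a minimal sketch; the non-matching filename is an arbitrary example):

```python
from woc.detect import parse_map_fname

# sharded basemap: commit-to-file map, version R, shard 3
assert parse_map_fname("c2fFullR.3.tch") == ("c", "f", "R", "3")
# unsharded basemap: the shard index comes back as None
assert parse_map_fname("c2fFullR.tch") == ("c", "f", "R", None)
# names that do not follow the <src>2<dst>Full<ver> convention yield None
assert parse_map_fname("README.md") is None
```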
LARGE_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch.large.([0-9a-f]+)$"
Filename regex for large basemap files
def parse_large_fname(fname: str):
Parse basemap filename into (src, dst, ver, idx, hash).
>>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
('A', 'c', 'U', '15', '59016a4f')
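In `detect_profile`, files matching this pattern are filed under the map's `larges` dict, keyed by the trailing hex digest rather than by shard index; for example:

```python
from woc.detect import parse_large_fname

src, dst, ver, idx, digest = parse_large_fname("A2cFullU.15.tch.large.59016a4f")
# the digest, not the shard index, becomes the key in _map["larges"]
assert (src, dst, ver, idx, digest) == ("A", "c", "U", "15", "59016a4f")
```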
OBJ_REGEX = r"^([\w\.]+)_(\d+).(idx|bin|tch)$"
Filename regex for object files
def parse_obj_fname(fname: str):
Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).
>>> parse_obj_fname('commit_0.tch')
('commit', '0', 'tch')
>>> parse_obj_fname('blob_0.idx')
('blob', '0', 'idx')
>>> parse_obj_fname('sha1.blob_0.bin')
('sha1.blob', '0', 'bin')
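The shard index embedded in the filename is what later drives the `sharding_bits` computation in `detect_profile` (`int(idx).bit_length()`); a small illustration with a hypothetical filename:

```python
from woc.detect import parse_obj_fname

name, idx, ext = parse_obj_fname("sha1.blob_127.bin")
assert (name, ext) == ("sha1.blob", "bin")
# shard index 127 needs 7 bits, so detect_profile infers 2**7 = 128 shards
assert int(idx).bit_length() == 7
```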
def compare_woc_version(ver1: str, ver2: str):
Compare two woc version strings (A < Z < AA).
>>> compare_woc_version('S', 'T') > 0
False
>>> compare_woc_version('AA', 'U') > 0
True
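`detect_profile` uses this comparator through `functools.cmp_to_key` to order map versions newest-first:

```python
from functools import cmp_to_key
from woc.detect import compare_woc_version

versions = ["T", "AA", "S", "U"]
# longer version strings sort as newer, mirroring A < Z < AA
versions.sort(key=cmp_to_key(compare_woc_version), reverse=True)
assert versions == ["AA", "U", "T", "S"]
```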
def infer_dtype(map_name: str) -> Tuple[str, str]:
Infer the data types from the map's name (entity -> entity).
Should be bug-to-bug compatible with: https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34
>>> infer_dtype('c2f')
('h', 'cs')
>>> infer_dtype('b2tac')
('h', 'cs3')
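A few more pairs, derived by tracing the rules in the source above (a sketch; what each dtype code means is defined by the map readers elsewhere in the package):

```python
from woc.detect import infer_dtype

assert infer_dtype("p2c") == ("s", "h")    # string key in, hash values out
assert infer_dtype("c2dat") == ("h", "s")  # special-cased to a string payload
assert infer_dtype("c2r") == ("h", "r")    # c2h/c2r get the 'r' output type
```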
def detect_profile(paths: Iterable[str], version: Optional[str] = None, preset_path: Optional[str] = None, check_missing: bool = True, with_digest: bool = False):
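Walks every directory in `paths`, collects basemap shards, large-map files, and object files into a profile dictionary, optionally dropping maps with missing shards (`check_missing`) and sampling an MD5 digest per file (`with_digest`), then merges the result into the preset profile under the `maps` and `objects` keys.

Programmatic use mirrors the CLI entry point at the bottom of the module; a minimal sketch (the mount paths and version letter here are made up for illustration):

```python
import json
from woc.detect import detect_profile

profile = detect_profile(
    ["/da8_data/basemaps", "/da8_data/All.sha1o"],  # hypothetical WoC mounts
    version="V",          # keep only maps of this version; None keeps all
    with_digest=False,    # True additionally samples an MD5 per file
)
with open("wocprofile.json", "w") as fp:
    json.dump(profile, fp, indent=2)
```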