woc.detect

#!/usr/bin/env python3

# SPDX-License-Identifier: GPL-3.0-or-later
# @authors: Runzhi He <rzhe@pku.edu.cn>
# @date: 2024-01-17

import argparse
import json
import logging
import os
import re
from functools import cmp_to_key
from typing import Iterable, Optional, Tuple

from tqdm import tqdm

from .utils import sample_md5

_default_profile = os.path.join(os.path.dirname(__file__), "wocprofile.default.json")
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.INFO)

MAP_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:\.(\d+))?\.tch$"
"""Filename regex for basemap files"""
_map_pat = re.compile(MAP_REGEX)


def parse_map_fname(fname: str):
    """
    Parse basemap filename into (src, dst, ver, idx).

    >>> parse_map_fname('c2fFullR.3.tch')
    ('c', 'f', 'R', '3')
    >>> parse_map_fname('c2fFullR.tch')
    ('c', 'f', 'R', None)
    """
    m = _map_pat.match(fname)
    if not m or len(m.groups()) != 4:
        return None
    return m.groups()


LARGE_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:\.(\d+))?\.tch\.large\.([0-9a-f]+)$"
"""Filename regex for large basemap files"""
_large_pat = re.compile(LARGE_REGEX)


def parse_large_fname(fname: str):
    """
    Parse basemap filename into (src, dst, ver, idx, hash).

    >>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
    ('A', 'c', 'U', '15', '59016a4f')
    """
    m = _large_pat.match(fname)
    if not m or len(m.groups()) != 5:
        return None
    return m.groups()


OBJ_REGEX = r"^([\w\.]+)_(\d+)\.(idx|bin|tch)$"
"""Filename regex for object files"""
_obj_pat = re.compile(OBJ_REGEX)


def parse_obj_fname(fname: str):
    """
    Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).

    >>> parse_obj_fname('commit_0.tch')
    ('commit', '0', 'tch')
    >>> parse_obj_fname('blob_0.idx')
    ('blob', '0', 'idx')
    >>> parse_obj_fname('sha1.blob_0.bin')
    ('sha1.blob', '0', 'bin')
    """
    m = _obj_pat.match(fname)
    if not m or len(m.groups()) != 3:
        return None
    return m.groups()


def compare_woc_version(ver1: str, ver2: str):
    """
    Compare two woc version strings (A < Z < AA).

    >>> compare_woc_version('S', 'T') > 0
    False
    >>> compare_woc_version('AA', 'U') > 0
    True
    """
    if len(ver1) != len(ver2):
        return len(ver1) - len(ver2)
    return ord(ver1[0]) - ord(ver2[0])


def infer_dtype(map_name: str) -> Tuple[str, str]:
    """
    Infer the data types from the map's name (entity -> entity).

    Should be bug-to-bug compatible with:
    https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34
    >>> infer_dtype('c2f')
    ('h', 'cs')
    >>> infer_dtype('b2tac')
    ('h', 'cs3')
    """
    ent_all = map_name.lower()
    ent_in, ent_out = ent_all.split("2")

    dtype_in, dtype_out = "h", "h"

    if ent_in in ("a", "f", "p"):
        dtype_in = "s"
    if ent_out in ("a", "f", "p"):
        dtype_out = "cs"
    if ent_in in ("c", "b", "w", "ob", "td"):
        dtype_in = "h"
    if ent_out in ("c", "b", "cc", "pc", "ob", "td"):
        dtype_out = "h"
    if ent_all in ("b2fa",):  # trailing comma makes this a 1-tuple, not a bare string
        dtype_out = "cs3"
    if ent_out in ("ta", "tag"):
        dtype_out = "s"
    if ent_all in (
        "b2tk",
        "td2f",
    ):
        dtype_out = "s"
    if ent_all in ("c2h", "c2r"):
        dtype_out = "r"
    if ent_in in ("ps", "pf", "pfs"):
        dtype_in = "s"
    if ent_out in ("ps", "pf", "pfs"):
        dtype_out = "s"
    if ent_out in ("rhp",):
        dtype_out = "hhwww"
    if ent_all in ("p2p", "a2a"):
        dtype_in, dtype_out = "s", "cs"
    if ent_all in ("b2baddate", "b2manyp"):
        dtype_in, dtype_out = "s", "h"
    if ent_all in ("c2fbb", "obb2cf", "bb2cf"):
        dtype_in, dtype_out = "h", "cs3"
    if ent_all in ("c2dat",):
        dtype_in, dtype_out = "h", "s"
    if ent_all in ("b2tac", "b2cff"):
        dtype_in, dtype_out = "h", "cs3"
    if ent_all in ("po2pn",):
        dtype_in, dtype_out = "s", "cs"
    return dtype_in, dtype_out


def detect_profile(
    paths: Iterable[str],
    version: Optional[str] = None,
    preset_path: Optional[str] = None,
    check_missing: bool = True,
    with_digest: bool = False,
):
    _maps, _objs = {}, {}

    if not preset_path:
        preset_path = _default_profile

    # `root` and `f` below are late-bound closures over the os.walk loop variables
    def _handle_map(src, dst, ver, idx, hash):
        if version and ver != version:
            _logger.info(f"Found map {f} with version {ver}, expected {version}")
            return

        _map_name = f"{src}2{dst}"
        if idx is None:
            idx = "0"
        prefix_len = int(idx).bit_length()

        _map = _maps.setdefault(_map_name, {}).setdefault(
            ver,
            {
                "version": ver,
                "sharding_bits": prefix_len,
                "shards": {},
                "larges": {},
                "dtypes": infer_dtype(_map_name),
            },
        )
        if not hash:
            _logger.debug(f"Found map {f} with idx {idx}")
            _map["shards"][int(idx)] = os.path.join(root, f)
        else:
            _logger.debug(f"Found large map {f} with hash {hash} idx {idx}")
            _map["larges"][hash] = os.path.join(root, f)
        _map["sharding_bits"] = max(_map["sharding_bits"], prefix_len)

    def _handle_obj(name, idx, ext):
        # quirk for "All.sha1" folder
        # we don't want sha1.blob_*.tch files
        if root.endswith("All.sha1"):
            return

        _map_name = f"{name}.{ext}"
        prefix_len = int(idx).bit_length() if idx else 0
        _obj = _objs.setdefault(
            _map_name,
            {
                "sharding_bits": prefix_len,
                "shards": {},
            },
        )
        _logger.debug(f"Found obj {f} idx {idx}")
        _obj["shards"][int(idx)] = os.path.join(root, f)
        _obj["sharding_bits"] = max(_obj["sharding_bits"], prefix_len)

    for path in paths:
        # walk the directory for all files
        for root, _, files in os.walk(path):
            # only consider .tch, .idx, .bin files
            files = [
                f
                for f in files
                if ".tch" in f
                or (not f.startswith("pack") and f.endswith(".idx"))
                or f.endswith(".bin")
            ]
            for f in tqdm(files, desc=root):
                _r = parse_map_fname(f)
                if _r:
                    src, dst, ver, idx = _r
                    _handle_map(src, dst, ver, idx, None)
                    continue

                _r = parse_large_fname(f)
                if _r:
                    src, dst, ver, idx, hash = _r
                    _handle_map(src, dst, ver, idx, hash)
                    continue

                _r = parse_obj_fname(f)
                if _r:
                    name, idx, ext = _r
                    _handle_obj(name, idx, ext)
                    continue
                _logger.warning(f"Unrecognized file: {f}")

    # transform maps
    _ls_maps = {}
    for k, v in _maps.items():
        _to_drop = []
        for ver, vv in v.items():
            # convert shards to list
            _ls = [None] * 2 ** vv["sharding_bits"]
            for kkk, vvv in vv["shards"].items():
                _ls[kkk] = vvv
            # check for missing (None) shards
            _nones = [i for i, x in enumerate(_ls) if x is None]
            if _nones and check_missing:
                _logger.warning(
                    f'Cannot find shards {", ".join(map(str, _nones))} in map {k} ver {ver}, skipping'
                )
                _logger.warning(f"Got: {vv['shards']}")
                _to_drop.append(ver)
            else:
                vv["shards"] = _ls
        for ver in _to_drop:
            del v[ver]

        # move latest maps to the front of the list
        if len(v) == 0:
            continue
        _ls_maps[k] = [
            vv
            for _, vv in sorted(
                v.items(),
                key=cmp_to_key(lambda x, y: compare_woc_version(x[0], y[0])),
                reverse=True,
            )
        ]

    # transform objects
    _ls_objs = {}
    for k, v in _objs.items():
        # convert shards to list
        _ls = [None] * 2 ** v["sharding_bits"]
        for kk, vv in v["shards"].items():
            _ls[kk] = vv
        # check for missing (None) shards
        _nones = [i for i, x in enumerate(_ls) if x is None]
        if _nones and check_missing:
            _logger.warning(
                f'Cannot find shards {", ".join(map(str, _nones))} in obj {k}, skipping'
            )
            _logger.warning(f"Got: {v['shards']}")
        else:
            v["shards"] = _ls
            _ls_objs[k] = v

    # quirk for da* servers
    # follow symlinks to avoid the overhead of NFS
    def _resolve_path(file_path: str) -> str:
        _resolved = os.path.realpath(file_path)
        if file_path != _resolved and re.match(r"^/da[0-9]+", _resolved):
            _logger.warning(f"Resolve {file_path} to {_resolved}")
            return _resolved
        return file_path

    # transform to v2
    _total_files = 0
    for l_maps in _ls_maps.values():
        for _map in l_maps:
            # iterate over shards
            _new_shards = []
            for shard_path in _map["shards"]:
                if shard_path is None:
                    continue
                _new_shards.append(
                    {
                        "path": _resolve_path(shard_path),
                        "size": os.path.getsize(shard_path),
                        "digest": None,
                    }
                )
            _map["shards"] = _new_shards
            _total_files += len(_new_shards)

            # iterate over larges
            _new_larges = {}
            for hash, large_path in _map["larges"].items():
                _new_larges[hash] = {
                    "path": _resolve_path(large_path),
                    "size": os.path.getsize(large_path),
                    "digest": None,
                }
            _map["larges"] = _new_larges
            _total_files += len(_new_larges)

    for obj_name, obj in _ls_objs.items():
        _new_shards = []
        for shard_path in obj["shards"]:
            if shard_path is None:
                continue
            _new_shards.append(
                {
                    "path": _resolve_path(shard_path),
                    "size": os.path.getsize(shard_path),
                    "digest": None,
                }
            )
        obj["shards"] = _new_shards
        _total_files += len(_new_shards)

        # aliases
        if obj_name == "tree.tch":
            obj["alias"] = "tree"
        elif obj_name == "commit.tch":
            obj["alias"] = "commit"
        elif obj_name == "sha1.blob.tch":
            obj["alias"] = "blob"

    if with_digest:
        with tqdm(total=_total_files, desc="Calculating digests") as pbar:
            for l_maps in _ls_maps.values():
                for _map in l_maps:
                    for shard in _map["shards"]:
                        if shard is None:
                            continue
                        shard["digest"] = sample_md5(shard["path"])
                        pbar.update(1)
                    for large in _map["larges"].values():
                        large["digest"] = sample_md5(large["path"])
                        pbar.update(1)

            for obj in _ls_objs.values():
                for shard in obj["shards"]:
                    if shard is None:
                        continue
                    shard["digest"] = sample_md5(shard["path"])
                    pbar.update(1)

    # load the preset profile
    with open(preset_path, "r") as f:
        res = json.load(f)

    res["maps"] = _ls_maps
    res["objects"] = _ls_objs
    return res


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    parser = argparse.ArgumentParser(description="Detect woc profile")
    parser.add_argument(
        "paths", metavar="PATH", type=str, nargs="+", help="path to woc directory"
    )
    parser.add_argument("--version", type=str, default=None, help="woc mapping version")
    parser.add_argument(
        "--preset", type=str, default=_default_profile, help="path to preset profile"
    )
    parser.add_argument("--output", type=str, default=None, help="path to output profile")
    parser.add_argument(
        "--no-skip-missing",
        dest="check_missing",
        action="store_false",
        help="do not check missing shards",
    )
    parser.add_argument(
        "--with-digest",
        dest="with_digest",
        action="store_true",
        help="calculate digest for each file",
        default=False,
    )

    args = parser.parse_args()

    res = detect_profile(
        args.paths, args.version, args.preset, args.check_missing, args.with_digest
    )
    if args.output:
        with open(args.output, "w") as f:
            json.dump(res, f, indent=2)
    else:
        print(json.dumps(res, indent=2))
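
Run as a script, the module prints the assembled profile to stdout or writes it to a file. A minimal command-line sketch, assuming the woc package is importable (installed or on PYTHONPATH) and using an illustrative basemap path:

python3 -m woc.detect /da0_data/basemaps --output wocprofile.json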
MAP_REGEX = '^(\\w+)2(\\w+)Full(\\w+)(?:\\.(\\d+))?\\.tch$'

Filename regex for basemap files

def parse_map_fname(fname: str):

Parse basemap filename into (src, dst, ver, idx).

>>> parse_map_fname('c2fFullR.3.tch')
('c', 'f', 'R', '3')
>>> parse_map_fname('c2fFullR.tch')
('c', 'f', 'R', None)
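
Filenames that do not fit the basemap pattern yield None rather than raising; for example, a large-file spill (handled by parse_large_fname below) fails to match because it does not end in .tch:

>>> parse_map_fname('A2cFullU.15.tch.large.59016a4f') is None
True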
LARGE_REGEX = '^(\\w+)2(\\w+)Full(\\w+)(?:\\.(\\d+))?\\.tch\\.large\\.([0-9a-f]+)$'

Filename regex for large basemap files

def parse_large_fname(fname: str):

Parse basemap filename into (src, dst, ver, idx, hash).

>>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
('A', 'c', 'U', '15', '59016a4f')
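
Conversely, a regular shard filename lacks the .large.<hash> suffix and is rejected here:

>>> parse_large_fname('c2fFullR.3.tch') is None
True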
OBJ_REGEX = '^([\\w\\.]+)_(\\d+)\\.(idx|bin|tch)$'

Filename regex for object files

def parse_obj_fname(fname: str):

Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).

>>> parse_obj_fname('commit_0.tch')
('commit', '0', 'tch')
>>> parse_obj_fname('blob_0.idx')
('blob', '0', 'idx')
>>> parse_obj_fname('sha1.blob_0.bin')
('sha1.blob', '0', 'bin')
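
detect_profile tries the three parsers in this order (basemap, large, object) and warns about anything that matches none of them. A minimal sketch of that dispatch; classify is a hypothetical helper, not part of the module:

def classify(fname: str):
    # Mirror the dispatch order used inside detect_profile.
    for kind, parser in (
        ("map", parse_map_fname),
        ("large", parse_large_fname),
        ("obj", parse_obj_fname),
    ):
        fields = parser(fname)
        if fields:
            return kind, fields
    return None  # unrecognized: detect_profile logs a warning here

>>> classify('c2fFullV.7.tch')
('map', ('c', 'f', 'V', '7'))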
def compare_woc_version(ver1: str, ver2: str):

Compare two woc version strings (A < Z < AA).

>>> compare_woc_version('S', 'T') > 0
False
>>> compare_woc_version('AA', 'U') > 0
True
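
detect_profile applies this comparator through functools.cmp_to_key to order map versions newest-first; a minimal sketch:

from functools import cmp_to_key

versions = ['R', 'AA', 'S', 'U']
ordered = sorted(versions, key=cmp_to_key(compare_woc_version), reverse=True)
# ordered == ['AA', 'U', 'S', 'R']: longer version strings sort above shorter ones

Note that equal-length versions are compared by their first character only, which suffices for the single-letter (later doubled-letter) scheme woc uses.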
def infer_dtype(map_name: str) -> Tuple[str, str]:

Infer the data types from the map's name (entity -> entity).

Should be bug-to-bug compatible with: https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34

>>> infer_dtype('c2f')
('h', 'cs')
>>> infer_dtype('b2tac')
('h', 'cs3')
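
A few more input/output pairs, derived directly from the rule table in the source above:

>>> infer_dtype('c2dat')
('h', 's')
>>> infer_dtype('p2p')
('s', 'cs')
>>> infer_dtype('c2r')
('h', 'r')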
def detect_profile(paths: Iterable[str], version: Optional[str] = None, preset_path: Optional[str] = None, check_missing: bool = True, with_digest: bool = False):
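
Scans the given paths for basemap shards, large-file spills, and object files, merges the result into the preset profile, and returns the profile dictionary (the same structure the CLI dumps as JSON). A minimal programmatic sketch with an illustrative path:

import json
from woc.detect import detect_profile

profile = detect_profile(["/da0_data/basemaps"], version="V")
with open("wocprofile.json", "w") as f:
    json.dump(profile, f, indent=2)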