woc.detect

  1#!/usr/bin/env python3
  2
  3# SPDX-License-Identifier: GPL-3.0-or-later
  4# @authors: Runzhi He <rzhe@pku.edu.cn>
  5# @date: 2024-01-17
  6
  7import argparse
  8import json
  9import logging
 10import os
 11import re
 12from functools import cmp_to_key
 13from typing import Iterable, Optional, Tuple
 14
 15from tqdm import tqdm
 16
 17from .utils import sample_md5
 18
 19_default_profile = os.path.join(os.path.dirname(__file__), "wocprofile.default.json")
 20_logger = logging.getLogger(__name__)
 21_logger.setLevel(logging.INFO)
 22
 23MAP_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch$"
 24"""Filename regex for basemap files"""
 25_map_pat = re.compile(MAP_REGEX)
 26
 27
 28def parse_map_fname(fname: str):
 29    """
 30    Parse basemap filename into (src, dst, ver, idx).
 31
 32    >>> parse_map_fname('c2fFullR.3.tch')
 33    ('c', 'f', 'R', '3')
 34    >>> parse_map_fname('c2fFullR.tch')
 35    ('c', 'f', 'R', None)
 36    """
 37    m = _map_pat.match(fname)
 38    if not m or len(m.groups()) != 4:
 39        return None
 40    return m.groups()
 41
 42
 43LARGE_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch.large.([0-9a-f]+)$"
 44"""Filename regex for large basemap files"""
 45_large_pat = re.compile(LARGE_REGEX)
 46
 47
 48def parse_large_fname(fname: str):
 49    """
 50    Parse basemap filename into (src, dst, ver, idx, hash).
 51
 52    >>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
 53    ('A', 'c', 'U', '15', '59016a4f')
 54    """
 55    m = _large_pat.match(fname)
 56    if not m or len(m.groups()) != 5:
 57        return None
 58    return m.groups()
 59
 60
 61OBJ_REGEX = r"^([\w\.]+)_(\d+).(idx|bin|tch)$"
 62"""Filename regex for object files"""
 63_obj_pat = re.compile(OBJ_REGEX)
 64
 65
 66def parse_obj_fname(fname: str):
 67    """
 68    Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).
 69
 70    >>> parse_obj_fname('commit_0.tch')
 71    ('commit', '0', 'tch')
 72    >>> parse_obj_fname('blob_0.idx')
 73    ('blob', '0', 'idx')
 74    >>> parse_obj_fname('sha1.blob_0.bin')
 75    ('sha1.blob', '0', 'bin')
 76    """
 77    m = _obj_pat.match(fname)
 78    if not m or len(m.groups()) != 3:
 79        return None
 80    return m.groups()
 81
 82
 83def compare_woc_version(ver1: str, ver2: str):
 84    """
 85    Compare two woc version strings (A < Z < AA).
 86
 87    >>> compare_woc_version('S', 'T') > 0
 88    False
 89    >>> compare_woc_version('AA', 'U') > 0
 90    True
 91    """
 92    if len(ver1) != len(ver2):
 93        return len(ver1) - len(ver2)
 94    return ord(ver1[0]) - ord(ver2[0])
 95
 96
 97def infer_dtype(map_name: str) -> Tuple[str, str]:
 98    """
 99    Infer the data types from the map's name (entity -> entity).
100
101    Should be bug-to-bug compatible with:
102    https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34
103    >>> infer_dtype('c2f')
104    ('h', 'cs')
105    >>> infer_dtype('b2tac')
106    ('h', 'cs3')
107    """
108    ent_all = map_name.lower()
109    ent_in, ent_out = ent_all.split("2")
110
111    dtype_in, dtype_out = "h", "h"
112
113    if ent_in in ("a", "f", "p"):
114        dtype_in = "s"
115    if ent_out in ("a", "f", "p"):
116        dtype_out = "cs"
117    if ent_in in ("c", "b", "w", "ob", "td"):
118        dtype_in = "h"
119    if ent_out in ("c", "b", "cc", "pc", "ob", "td"):
120        dtype_out = "h"
121    if ent_all in ("b2fa", "c2tag"):
122        dtype_out = "sh"
123    if ent_out in ("ta",):
124        dtype_out = "s"
125    if ent_all in ("b2tk", "td2f"):
126        dtype_out = "s"
127    if ent_all in ("c2h", "c2r"):
128        dtype_out = "r"
129    if ent_in in ("ps", "pf", "pfs"):
130        dtype_in = "s"
131    if ent_out in ("ps", "pf", "pfs"):
132        dtype_out = "s"
133    if ent_out in ("rhp",):
134        dtype_out = "hhwww"
135    if ent_all in ("p2p", "a2a"):
136        dtype_in, dtype_out = "s", "cs"
137    if ent_all in ("b2baddate", "b2manyp"):
138        dtype_in, dtype_out = "s", "h"
139    if ent_all in ("c2fbb", "obb2cf", "bb2cf"):
140        dtype_in, dtype_out = "h", "cs3"
141    if ent_all in ("c2dat",):
142        dtype_in, dtype_out = "h", "s"
143    if ent_all in ("b2tac",):
144        dtype_in, dtype_out = "h", "cs3"
145
146    return dtype_in, dtype_out
147
148
def detect_profile(
    paths: Iterable[str],
    version: Optional[str] = None,
    preset_path: Optional[str] = None,
    check_missing: bool = True,
    with_digest: bool = False,
):
    """
    Walk `paths` for woc basemap and object files and build a profile dict.

    Recognized files (by `parse_map_fname` / `parse_large_fname` /
    `parse_obj_fname`) are grouped into maps and objects, validated for
    missing shards, and merged into the JSON preset loaded from
    `preset_path` (defaults to the bundled `wocprofile.default.json`).

    :param paths: directories to walk recursively for .tch/.idx/.bin files.
    :param version: if set, only maps of exactly this woc version are kept.
    :param preset_path: path to the preset profile JSON; falsy -> default.
    :param check_missing: if True, drop maps/objects with missing shards.
    :param with_digest: if True, compute a sample MD5 digest per file.
    :return: the preset dict with "maps" and "objects" replaced.
    """
    _maps, _objs = {}, {}

    if not preset_path:
        preset_path = _default_profile

    # Both handlers close over the walk loop's `root` and `f` variables,
    # so they may only be called from inside the os.walk loop below.
    def _handle_map(src, dst, ver, idx, hash):
        # NOTE(review): uses the root logger (logging.info/debug) while
        # the rest of the module uses _logger — likely unintended; confirm.
        if version and ver != version:
            logging.info(f"Found map {f} with version {ver}, expected {version}")
            return

        _map_name = f"{src}2{dst}"
        if idx is None:
            idx = "0"
        # bit_length of the shard index approximates the sharding bits;
        # exact only when the max index is 2**bits - 1 — the running
        # max() below converges on the true value once all shards are seen.
        prefix_len = int(idx).bit_length()

        # One entry per (map name, version); shards/larges filled in
        # incrementally as files are discovered.
        _map = _maps.setdefault(_map_name, {}).setdefault(
            ver,
            {
                "version": ver,
                "sharding_bits": prefix_len,
                "shards": {},
                "larges": {},
                "dtypes": infer_dtype(_map_name),
            },
        )
        if not hash:
            logging.debug(f"Found map {f} with hash {hash} idx {idx}")
            _map["shards"][int(idx)] = os.path.join(root, f)
        else:
            logging.debug(f"Found large map {f} with hash {hash} idx {idx}")
            _map["larges"][hash] = os.path.join(root, f)
        _map["sharding_bits"] = max(_map["sharding_bits"], prefix_len)

    def _handle_obj(name, idx, ext):
        # quirk for "All.sha1" folder
        # we don't want sha1.blob_*.tch files
        if root.endswith("All.sha1"):
            return
        
        _map_name = f"{name}.{ext}"
        prefix_len = int(idx).bit_length() if idx else 0
        _obj = _objs.setdefault(
            _map_name,
            {
                "sharding_bits": prefix_len,
                "shards": {},
            },
        )
        logging.debug(f"Found obj {f} idx {idx}")
        _obj["shards"][int(idx)] = os.path.join(root, f)
        _obj["sharding_bits"] = max(_obj["sharding_bits"], prefix_len)

    for path in paths:
        # walk the directory for all files
        for root, _, files in os.walk(path):
            # only consider .tch, .idx, .bin files
            # (pack*.idx are git packfile indexes, not woc shards)
            files = [
                f
                for f in files
                if ".tch" in f
                or (not f.startswith("pack") and f.endswith(".idx"))
                or f.endswith(".bin")
            ]
            # NOTE(review): the enumerate index `idx` is immediately
            # shadowed by the tuple unpacks below and never used.
            for idx, f in enumerate(tqdm(files, desc=root)):
                _r = parse_map_fname(f)
                if _r:
                    src, dst, ver, idx = _r
                    _handle_map(src, dst, ver, idx, None)
                    continue

                _r = parse_large_fname(f)
                if _r:
                    src, dst, ver, idx, hash = _r
                    _handle_map(src, dst, ver, idx, hash)
                    continue

                _r = parse_obj_fname(f)
                if _r:
                    name, idx, ext = _r
                    _handle_obj(name, idx, ext)
                    continue
                _logger.warning(f"Unrecognized file: {f}")

    # transform maps: shard dicts -> dense lists indexed by shard number
    _ls_maps = {}
    for k, v in _maps.items():
        _to_drop = []
        for ver, vv in v.items():
            # convert shards to list
            _ls = [None] * 2 ** vv["sharding_bits"]
            for kkk, vvv in vv["shards"].items():
                _ls[kkk] = vvv
            # see if we can find the None in _ls
            _nones = [i for i, x in enumerate(_ls) if x is None]
            if _nones and check_missing:
                _logger.warning(
                    f'Cannot find shards {", ".join(map(str, _nones))} in map {k} ver {ver}, skipping'
                )
                _logger.warning(f"Got: {vv['shards']}")
                _to_drop.append(ver)
            else:
                vv["shards"] = _ls
        # drop incomplete versions after iteration (can't delete while iterating)
        for ver in _to_drop:
            del v[ver]

        # move latest maps to the front of the list
        if len(v) == 0:
            continue
        _ls_maps[k] = [
            vv
            for _, vv in sorted(
                v.items(),
                key=cmp_to_key(lambda x, y: compare_woc_version(x[0], y[0])),
                reverse=True,
            )
        ]

    # transform objects: same densification, but objects are unversioned
    _ls_objs = {}
    for k, v in _objs.items():
        # convert shards to list
        _ls = [None] * 2 ** v["sharding_bits"]
        for kk, vv in v["shards"].items():
            _ls[kk] = vv
        # see if we can find the None in _ls
        _nones = [i for i, x in enumerate(_ls) if x is None]
        if _nones and check_missing:
            _logger.warning(
                f'Cannot find shards {", ".join(map(str, _nones))} in obj {k}, skipping'
            )
            _logger.warning(f"Got: {v['shards']}")
        else:
            v["shards"] = _ls
            _ls_objs[k] = v

    # quirk for da* servers
    # follow symlinks to avoid the overhead of NFS
    def _resolve_path(file_path: str) -> str:
        _resolved = os.path.realpath(file_path)
        # only substitute when the target is a local /daNN disk;
        # otherwise keep the original (possibly symlinked) path
        if file_path != _resolved and re.match(r"^/da[0-9]+", _resolved):
            _logger.warning(f"Resolve {file_path} to {_resolved}")
            return _resolved
        return file_path

    # transform to v2: plain path strings -> {path, size, digest} records
    _total_files = 0
    for l_maps in _ls_maps.values():
        for _map in l_maps:
            # iterate over shards
            _new_shards = []
            for shard_path in _map["shards"]:
                if shard_path is None:
                    continue
                _new_shards.append(
                    {
                        "path": _resolve_path(shard_path),
                        "size": os.path.getsize(shard_path),
                        "digest": None,
                    }
                )
            _map["shards"] = _new_shards
            _total_files += len(_new_shards)

            # iterate over larges
            _new_larges = {}
            for hash, large_path in _map["larges"].items():
                _new_larges[hash] = {
                    "path": _resolve_path(large_path),
                    "size": os.path.getsize(large_path),
                    "digest": None,
                }
            _map["larges"] = _new_larges
            _total_files += len(_new_larges)

    for obj_name, obj in _ls_objs.items():
        _new_shards = []
        for shard_path in obj["shards"]:
            if shard_path is None:
                continue
            _new_shards.append(
                {
                    "path": _resolve_path(shard_path),
                    "size": os.path.getsize(shard_path),
                    "digest": None,
                }
            )
        obj["shards"] = _new_shards
        _total_files += len(_new_shards)

        # aliases
        if obj_name == "tree.tch":
            obj["alias"] = "tree"
        elif obj_name == "commit.tch":
            obj["alias"] = "commit"
        elif obj_name == "sha1.blob.tch":
            obj["alias"] = "blob"

    # optional second pass: fill in the digest field computed above as None
    if with_digest:
        with tqdm(total=_total_files, desc="Calculating digests") as pbar:
            for l_maps in _ls_maps.values():
                for _map in l_maps:
                    for shard in _map["shards"]:
                        if shard is None:
                            continue
                        shard["digest"] = sample_md5(shard["path"])
                        pbar.update(1)
                    for large in _map["larges"].values():
                        large["digest"] = sample_md5(large["path"])
                        pbar.update(1)

            for obj in _ls_objs.values():
                for shard in obj["shards"]:
                    if shard is None:
                        continue
                    shard["digest"] = sample_md5(shard["path"])
                    pbar.update(1)

    # load the preset profile and graft the detected maps/objects onto it
    with open(preset_path, "r") as f:
        res = json.load(f)

    res["maps"] = _ls_maps
    res["objects"] = _ls_objs
    return res
380
381
if __name__ == "__main__":
    # Run the module doctests before doing any real work.
    import doctest

    doctest.testmod()

    # CLI: detect a woc profile from one or more directories.
    parser = argparse.ArgumentParser(description="Detect woc profile")
    parser.add_argument(
        "paths", metavar="PATH", type=str, nargs="+", help="path to woc directory"
    )
    parser.add_argument("--version", type=str, default=None, help="woc mapping version")
    parser.add_argument(
        "--preset", type=str, default=_default_profile, help="path to preset profile"
    )
    parser.add_argument("--output", type=str, default=None, help="path to output profile")
    parser.add_argument(
        "--no-skip-missing",
        dest="check_missing",
        action="store_false",
        help="do not check missing shards",
    )
    parser.add_argument(
        "--with-digest",
        dest="with_digest",
        action="store_true",
        help="calculate digest for each file",
        default=False,
    )

    args = parser.parse_args()

    profile = detect_profile(
        args.paths, args.version, args.preset, args.check_missing, args.with_digest
    )
    # Serialize once; write to --output when given, otherwise to stdout.
    rendered = json.dumps(profile, indent=2)
    if args.output:
        with open(args.output, "w") as out:
            out.write(rendered)
    else:
        print(rendered)
MAP_REGEX = '^(\\w+)2(\\w+)Full(\\w+)(?:.(\\d+))?.tch$'

Filename regex for basemap files

def parse_map_fname(fname: str):
29def parse_map_fname(fname: str):
30    """
31    Parse basemap filename into (src, dst, ver, idx).
32
33    >>> parse_map_fname('c2fFullR.3.tch')
34    ('c', 'f', 'R', '3')
35    >>> parse_map_fname('c2fFullR.tch')
36    ('c', 'f', 'R', None)
37    """
38    m = _map_pat.match(fname)
39    if not m or len(m.groups()) != 4:
40        return None
41    return m.groups()

Parse basemap filename into (src, dst, ver, idx).

>>> parse_map_fname('c2fFullR.3.tch')
('c', 'f', 'R', '3')
>>> parse_map_fname('c2fFullR.tch')
('c', 'f', 'R', None)
LARGE_REGEX = '^(\\w+)2(\\w+)Full(\\w+)(?:.(\\d+))?.tch.large.([0-9a-f]+)$'

Filename regex for large basemap files

def parse_large_fname(fname: str):
49def parse_large_fname(fname: str):
50    """
51    Parse basemap filename into (src, dst, ver, idx, hash).
52
53    >>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
54    ('A', 'c', 'U', '15', '59016a4f')
55    """
56    m = _large_pat.match(fname)
57    if not m or len(m.groups()) != 5:
58        return None
59    return m.groups()

Parse basemap filename into (src, dst, ver, idx, hash).

>>> parse_large_fname('A2cFullU.15.tch.large.59016a4f')
('A', 'c', 'U', '15', '59016a4f')
OBJ_REGEX = '^([\\w\\.]+)_(\\d+).(idx|bin|tch)$'

Filename regex for object files

def parse_obj_fname(fname: str):
67def parse_obj_fname(fname: str):
68    """
69    Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).
70
71    >>> parse_obj_fname('commit_0.tch')
72    ('commit', '0', 'tch')
73    >>> parse_obj_fname('blob_0.idx')
74    ('blob', '0', 'idx')
75    >>> parse_obj_fname('sha1.blob_0.bin')
76    ('sha1.blob', '0', 'bin')
77    """
78    m = _obj_pat.match(fname)
79    if not m or len(m.groups()) != 3:
80        return None
81    return m.groups()

Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext).

>>> parse_obj_fname('commit_0.tch')
('commit', '0', 'tch')
>>> parse_obj_fname('blob_0.idx')
('blob', '0', 'idx')
>>> parse_obj_fname('sha1.blob_0.bin')
('sha1.blob', '0', 'bin')
def compare_woc_version(ver1: str, ver2: str):
84def compare_woc_version(ver1: str, ver2: str):
85    """
86    Compare two woc version strings (A < Z < AA).
87
88    >>> compare_woc_version('S', 'T') > 0
89    False
90    >>> compare_woc_version('AA', 'U') > 0
91    True
92    """
93    if len(ver1) != len(ver2):
94        return len(ver1) - len(ver2)
95    return ord(ver1[0]) - ord(ver2[0])

Compare two woc version strings (A < Z < AA).

>>> compare_woc_version('S', 'T') > 0
False
>>> compare_woc_version('AA', 'U') > 0
True
def infer_dtype(map_name: str) -> Tuple[str, str]:
 98def infer_dtype(map_name: str) -> Tuple[str, str]:
 99    """
100    Infer the data types from the map's name (entity -> entity).
101
102    Should be bug-to-bug compatible with:
103    https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34
104    >>> infer_dtype('c2f')
105    ('h', 'cs')
106    >>> infer_dtype('b2tac')
107    ('h', 'cs3')
108    """
109    ent_all = map_name.lower()
110    ent_in, ent_out = ent_all.split("2")
111
112    dtype_in, dtype_out = "h", "h"
113
114    if ent_in in ("a", "f", "p"):
115        dtype_in = "s"
116    if ent_out in ("a", "f", "p"):
117        dtype_out = "cs"
118    if ent_in in ("c", "b", "w", "ob", "td"):
119        dtype_in = "h"
120    if ent_out in ("c", "b", "cc", "pc", "ob", "td"):
121        dtype_out = "h"
122    if ent_all in ("b2fa", "c2tag"):
123        dtype_out = "sh"
124    if ent_out in ("ta",):
125        dtype_out = "s"
126    if ent_all in ("b2tk", "td2f"):
127        dtype_out = "s"
128    if ent_all in ("c2h", "c2r"):
129        dtype_out = "r"
130    if ent_in in ("ps", "pf", "pfs"):
131        dtype_in = "s"
132    if ent_out in ("ps", "pf", "pfs"):
133        dtype_out = "s"
134    if ent_out in ("rhp",):
135        dtype_out = "hhwww"
136    if ent_all in ("p2p", "a2a"):
137        dtype_in, dtype_out = "s", "cs"
138    if ent_all in ("b2baddate", "b2manyp"):
139        dtype_in, dtype_out = "s", "h"
140    if ent_all in ("c2fbb", "obb2cf", "bb2cf"):
141        dtype_in, dtype_out = "h", "cs3"
142    if ent_all in ("c2dat",):
143        dtype_in, dtype_out = "h", "s"
144    if ent_all in ("b2tac",):
145        dtype_in, dtype_out = "h", "cs3"
146
147    return dtype_in, dtype_out

Infer the data types from the map's name (entity -> entity).

Should be bug-to-bug compatible with: https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34

>>> infer_dtype('c2f')
('h', 'cs')
>>> infer_dtype('b2tac')
('h', 'cs3')
def detect_profile( paths: Iterable[str], version: Optional[str] = None, preset_path: Optional[str] = None, check_missing: bool = True, with_digest: bool = False):
150def detect_profile(
151    paths: Iterable[str],
152    version: Optional[str] = None,
153    preset_path: Optional[str] = None,
154    check_missing: bool = True,
155    with_digest: bool = False,
156):
157    _maps, _objs = {}, {}
158
159    if not preset_path:
160        preset_path = _default_profile
161
162    def _handle_map(src, dst, ver, idx, hash):
163        if version and ver != version:
164            logging.info(f"Found map {f} with version {ver}, expected {version}")
165            return
166
167        _map_name = f"{src}2{dst}"
168        if idx is None:
169            idx = "0"
170        prefix_len = int(idx).bit_length()
171
172        _map = _maps.setdefault(_map_name, {}).setdefault(
173            ver,
174            {
175                "version": ver,
176                "sharding_bits": prefix_len,
177                "shards": {},
178                "larges": {},
179                "dtypes": infer_dtype(_map_name),
180            },
181        )
182        if not hash:
183            logging.debug(f"Found map {f} with hash {hash} idx {idx}")
184            _map["shards"][int(idx)] = os.path.join(root, f)
185        else:
186            logging.debug(f"Found large map {f} with hash {hash} idx {idx}")
187            _map["larges"][hash] = os.path.join(root, f)
188        _map["sharding_bits"] = max(_map["sharding_bits"], prefix_len)
189
190    def _handle_obj(name, idx, ext):
191        # quirk for "All.sha1" folder
192        # we don't want sha1.blob_*.tch files
193        if root.endswith("All.sha1"):
194            return
195        
196        _map_name = f"{name}.{ext}"
197        prefix_len = int(idx).bit_length() if idx else 0
198        _obj = _objs.setdefault(
199            _map_name,
200            {
201                "sharding_bits": prefix_len,
202                "shards": {},
203            },
204        )
205        logging.debug(f"Found obj {f} idx {idx}")
206        _obj["shards"][int(idx)] = os.path.join(root, f)
207        _obj["sharding_bits"] = max(_obj["sharding_bits"], prefix_len)
208
209    for path in paths:
210        # walk the directory for all files
211        for root, _, files in os.walk(path):
212            # only consider .tch, .idx, .bin files
213            files = [
214                f
215                for f in files
216                if ".tch" in f
217                or (not f.startswith("pack") and f.endswith(".idx"))
218                or f.endswith(".bin")
219            ]
220            for idx, f in enumerate(tqdm(files, desc=root)):
221                _r = parse_map_fname(f)
222                if _r:
223                    src, dst, ver, idx = _r
224                    _handle_map(src, dst, ver, idx, None)
225                    continue
226
227                _r = parse_large_fname(f)
228                if _r:
229                    src, dst, ver, idx, hash = _r
230                    _handle_map(src, dst, ver, idx, hash)
231                    continue
232
233                _r = parse_obj_fname(f)
234                if _r:
235                    name, idx, ext = _r
236                    _handle_obj(name, idx, ext)
237                    continue
238                _logger.warning(f"Unrecognized file: {f}")
239
240    # transform maps
241    _ls_maps = {}
242    for k, v in _maps.items():
243        _to_drop = []
244        for ver, vv in v.items():
245            # convert shards to list
246            _ls = [None] * 2 ** vv["sharding_bits"]
247            for kkk, vvv in vv["shards"].items():
248                _ls[kkk] = vvv
249            # see if we can find the None in _ls
250            _nones = [i for i, x in enumerate(_ls) if x is None]
251            if _nones and check_missing:
252                _logger.warning(
253                    f'Cannot find shards {", ".join(map(str, _nones))} in map {k} ver {ver}, skipping'
254                )
255                _logger.warning(f"Got: {vv['shards']}")
256                _to_drop.append(ver)
257            else:
258                vv["shards"] = _ls
259        for ver in _to_drop:
260            del v[ver]
261
262        # move latest maps to the front of the list
263        if len(v) == 0:
264            continue
265        _ls_maps[k] = [
266            vv
267            for _, vv in sorted(
268                v.items(),
269                key=cmp_to_key(lambda x, y: compare_woc_version(x[0], y[0])),
270                reverse=True,
271            )
272        ]
273
274    # transform objects
275    _ls_objs = {}
276    for k, v in _objs.items():
277        # convert shards to list
278        _ls = [None] * 2 ** v["sharding_bits"]
279        for kk, vv in v["shards"].items():
280            _ls[kk] = vv
281        # see if we can find the None in _ls
282        _nones = [i for i, x in enumerate(_ls) if x is None]
283        if _nones and check_missing:
284            _logger.warning(
285                f'Cannot find shards {", ".join(map(str, _nones))} in obj {k}, skipping'
286            )
287            _logger.warning(f"Got: {v['shards']}")
288        else:
289            v["shards"] = _ls
290            _ls_objs[k] = v
291
292    # quirk for da* servers
293    # follow symlinks to avoid the overhead of NFS
294    def _resolve_path(file_path: str) -> str:
295        _resolved = os.path.realpath(file_path)
296        if file_path != _resolved and re.match(r"^/da[0-9]+", _resolved):
297            _logger.warning(f"Resolve {file_path} to {_resolved}")
298            return _resolved
299        return file_path
300
301    # transform to v2
302    _total_files = 0
303    for l_maps in _ls_maps.values():
304        for _map in l_maps:
305            # iterate over shards
306            _new_shards = []
307            for shard_path in _map["shards"]:
308                if shard_path is None:
309                    continue
310                _new_shards.append(
311                    {
312                        "path": _resolve_path(shard_path),
313                        "size": os.path.getsize(shard_path),
314                        "digest": None,
315                    }
316                )
317            _map["shards"] = _new_shards
318            _total_files += len(_new_shards)
319
320            # iterate over larges
321            _new_larges = {}
322            for hash, large_path in _map["larges"].items():
323                _new_larges[hash] = {
324                    "path": _resolve_path(large_path),
325                    "size": os.path.getsize(large_path),
326                    "digest": None,
327                }
328            _map["larges"] = _new_larges
329            _total_files += len(_new_larges)
330
331    for obj_name, obj in _ls_objs.items():
332        _new_shards = []
333        for shard_path in obj["shards"]:
334            if shard_path is None:
335                continue
336            _new_shards.append(
337                {
338                    "path": _resolve_path(shard_path),
339                    "size": os.path.getsize(shard_path),
340                    "digest": None,
341                }
342            )
343        obj["shards"] = _new_shards
344        _total_files += len(_new_shards)
345
346        # aliases
347        if obj_name == "tree.tch":
348            obj["alias"] = "tree"
349        elif obj_name == "commit.tch":
350            obj["alias"] = "commit"
351        elif obj_name == "sha1.blob.tch":
352            obj["alias"] = "blob"
353
354    if with_digest:
355        with tqdm(total=_total_files, desc="Calculating digests") as pbar:
356            for l_maps in _ls_maps.values():
357                for _map in l_maps:
358                    for shard in _map["shards"]:
359                        if shard is None:
360                            continue
361                        shard["digest"] = sample_md5(shard["path"])
362                        pbar.update(1)
363                    for large in _map["larges"].values():
364                        large["digest"] = sample_md5(large["path"])
365                        pbar.update(1)
366
367            for obj in _ls_objs.values():
368                for shard in obj["shards"]:
369                    if shard is None:
370                        continue
371                    shard["digest"] = sample_md5(shard["path"])
372                    pbar.update(1)
373
374    # load the preset profile
375    with open(preset_path, "r") as f:
376        res = json.load(f)
377
378    res["maps"] = _ls_maps
379    res["objects"] = _ls_objs
380    return res