-
Notifications
You must be signed in to change notification settings - Fork 607
Open
Description
Would you guys mind adding an endpoint argument to `s3_sync` in `art.utils.s3`?
65 @limit_concurrency(1)
66 async def s3_sync(
67 source: str,
68 destination: str,
69 *,
70 profile: Optional[str] = None,
71 verbose: bool = False,
72 delete: bool = False,
73 exclude: list[ExcludableOption] | None = None,
74 ) -> None:
75 """Synchronise *source* and *destination* using the AWS CLI.
76
77 Either *source* or *destination* (or both) can point to an S3 URI, making it
78 possible to copy from local disk to S3 or from S3 to local disk.
79
80 The function is asynchronous: while the `aws` process runs, control is
81 yielded back to the event loop so other tasks can continue executing.
82
83 Args:
84 source: The *from* path. Can be a local path or an ``s3://`` URI.
85 destination: The *to* path. Can be a local path or an ``s3://`` URI.
86 profile: Optional AWS profile name to pass to the CLI.
87 verbose: When *True*, the output of the AWS CLI is streamed to the
88 calling process; otherwise it is suppressed.
89 exclude: List of directories to exclude from sync.
90
91 Raises:
92 S3SyncError: If the *aws s3 sync* command exits with a non‑zero status.
93 """
94
95 cmd: list[str] = ["aws"]
96
97 if profile:
98 cmd += ["--profile", profile]
99
100 cmd += ["s3"]
101 # use cp for files, sync for directories
102 if os.path.isfile(source):
103 cmd += ["cp"]
104 else:
105 cmd += ["sync"]
106
107 ENDPOINT = os.getenv("AWS_ENDPOINT_URL")
108 if ENDPOINT:
109 cmd += ["--endpoint", ENDPOINT]
110
111 if delete:
112 cmd.append("--delete")
113
114 # Add exclude patterns for each excluded directory
115 if exclude:
116 for excluded_dir in exclude:
117 cmd.extend(["--exclude", f"{excluded_dir}/*"])
118
119 cmd += [source, destination]
Otherwise, there are concurrency issues that pop up during sync:
upload failed: .art/2048/models/tutorial-007/checkpoints/0003/adapter_model.safetensors to s3://testingart/2048/models/tutorial-007/checkpoints/0003/adapter_model.safetensors 'ETag'
Traceback (most recent call last):
File "/workspace/ART/examples/2048/train.py", line 82, in <module>
asyncio.run(train())
File "/usr/local/lib/python3.12/dist-packages/nest_asyncio.py", line 30, in run
return loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/nest_asyncio.py", line 98, in run_until_complete
return f.result()
^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/futures.py", line 203, in result
raise self._exception.with_traceback(self._exception_tb)
File "/usr/lib/python3.12/asyncio/tasks.py", line 314, in __step_run_and_handle_result
result = coro.send(None)
^^^^^^^^^^^^^^^
File "/workspace/ART/examples/2048/train.py", line 70, in train
await backend._experimental_push_to_s3(
File "/usr/local/lib/python3.12/dist-packages/art/local/backend.py", line 748, in _experimental_push_to_s3
await push_model_to_s3(
File "/usr/local/lib/python3.12/dist-packages/art/utils/s3.py", line 265, in push_model_to_s3
await s3_sync(local_model_dir, s3_path, verbose=verbose, delete=delete)
File "/usr/local/lib/python3.12/dist-packages/art/utils/limit_concurrency.py", line 21, in wrapper
return await func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/art/utils/s3.py", line 129, in s3_sync
raise S3SyncError(f"{' '.join(cmd)} exited with status {return_code}")
art.utils.s3.S3SyncError: aws --endpoint http://minio:9000 s3 sync ./.art/2048/models/tutorial-007 s3://testingart/2048/models/tutorial-007 exited with status 1
^C^X^Z
[4]+ Stopped python3 train.py
root@art:/workspace/ART/examples/2048# exit
exit
There are stopped jobs.
```bash
root@art:/workspace/ART/examples/2048# aws --endpoint http://minio:9000 s3 sync ./.art/2048/models/tutorial-007 s3://testingart/2048/models/tutorial-007
upload: .art/2048/models/tutorial-007/logs/vllm.log to s3://testingart/2048/models/tutorial-007/logs/vllm.log
upload: .art/2048/models/tutorial-007/checkpoints/0003/adapter_model.safetensors to s3://testingart/2048/models/tutorial-007/checkpoints/0003/adapter_model.safetensors
root@art:/workspace/ART/examples/2048# env|grep AWSD
root@art:/workspace/ART/examples/2048# env|grep AWS
AWS_DEFAULT_REGION=us-east-1
AWS_DEFAULT_OUTPUT=json
AWS_SECRET_ACCESS_KEY=<removed>
AWS_OFI_NCCL_VERSION=1.14.0
AWS_ACCESS_KEY_ID=<removed>
AWS_ENDPOINT_URL=http://minio:9000
# Also need AWS_MAX_UPLOAD_SIZE set to avoid the ETag issue during sync
root@art:/workspace/ART/examples/2048# export AWS_MAX_UPLOAD_SIZE=104857600
```
I think this is because you guys are using an older version of the AWS CLI, but this workaround fixes the issue when using MinIO during training. Also, AWS_DEFAULT_OUTPUT is needed; otherwise an error pops up because the older AWS CLI version does not default the output format.
Metadata
Metadata
Assignees
Labels
No labels