in src/autotrain/backends/nvcf.py [0:0]
def _poll_nvcf(self, url, token, job_name, method="get", timeout=86400, interval=30, op="poll"):
    """
    Poll ``url`` with bearer-token authenticated GET requests until the job
    finishes, reports a failure, or ``timeout`` seconds have elapsed.

    Status handling per poll:

    * 200 -- marks the operation as successful. Polling then continues (so any
      further log output is still streamed) until a 404 is returned or the
      timeout elapses.
    * 202 -- still in progress; keep polling.
    * 404 after a previous 200 -- stop polling.
    * 500 -- treated as a failed job: the ``detail`` field of the response body
      (if any) is printed and polling stops immediately.
    * Any other 4xx/5xx, connection errors and unparsable bodies are logged and
      retried after ``interval`` seconds.

    New content of the ``log`` field in 200/202 responses is printed line by
    line, skipping blank lines.

    Args:
        url: Endpoint to poll.
        token: Bearer token sent in the ``Authorization`` header.
        job_name: Name used as a prefix in progress log messages.
        method: HTTP method; only ``"get"`` (case-insensitive) is supported.
        timeout: Overall polling budget in seconds.
        interval: Delay in seconds between consecutive polls.
        op: Operation label used in the error message.

    Raises:
        TimeoutError: If polling stops without ever having seen a 200
            response (timeout elapsed, or a 500 failure was reported).
    """
    timeout = float(timeout)
    interval = float(interval)
    start_time = time.time()
    success = False
    last_full_log = ""
    # Headers do not change between polls, so build them once.
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}

    def print_nonblank_lines(text):
        # Echo every line of `text` that contains something other than whitespace.
        for line in str(text).split("\n"):
            if line.strip():
                print(line)

    while time.time() - start_time < timeout:
        try:
            if method.upper() != "GET":
                raise ValueError(f"Unsupported HTTP method: {method}")
            response = requests.get(url, headers=headers)
            status = response.status_code

            if status == 404 and success:
                break

            # A 500 must be inspected BEFORE raise_for_status(): that call
            # raises HTTPError for every 5xx, which would otherwise make this
            # branch unreachable and keep polling a failed job until timeout.
            if status == 500:
                logger.error("Training failed")
                try:
                    failure = response.json()
                except ValueError:
                    failure = None  # body is not JSON; nothing more to show
                if isinstance(failure, dict) and "detail" in failure:
                    print_nonblank_lines(failure["detail"])
                break

            response.raise_for_status()

            try:
                data = response.json()
            except ValueError:
                logger.error("Failed to parse JSON from response")
                # Do not `continue` here: that would skip the sleep below and
                # hammer the endpoint in a tight loop.
                data = None

            if data is not None and status in (200, 202):
                logger.info(
                    f"{job_name}: {method} - {response.status_code} - {'Polling completed' if response.status_code == 200 else 'Polling reqId for completion'}"
                )

                if isinstance(data, dict) and "log" in data:
                    current_full_log = data["log"]
                    if isinstance(current_full_log, str) and current_full_log != last_full_log:
                        # Only print the part that was appended since the last poll.
                        print_nonblank_lines(current_full_log[len(last_full_log) :])
                        last_full_log = current_full_log

                if status == 200:
                    success = True

        except requests.HTTPError as http_err:
            # The "404 after success" case already left the loop above, so
            # every HTTPError reaching this handler is worth logging.
            logger.error(f"HTTP error occurred: {http_err}")
        except (requests.ConnectionError, ValueError) as err:
            logger.error(f"Error while handling request: {err}")

        time.sleep(interval)

    if not success:
        raise TimeoutError(f"Operation '{op}' did not complete successfully within the timeout period.")