5-4o_fine_tuning/util.py (42 lines of code) (raw):
import json
import subprocess
from collections import OrderedDict
from datasets import load_dataset
def get_semgrep_version():
    """
    Return the version of the installed semgrep executable.

    Runs ``semgrep --version`` and returns the last whitespace-separated
    token of its standard output.

    Returns:
        str: The reported version, or 'unknown version' if semgrep cannot be
        launched, exits with a non-zero status, or prints nothing.
    """
    try:
        # check=True is what makes subprocess.run raise CalledProcessError on
        # a non-zero exit status; without it that handler can never fire.
        result = subprocess.run(
            ["semgrep", "--version"], capture_output=True, text=True, check=True)
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing or non-executable semgrep binary
        # (FileNotFoundError / PermissionError).
        return "unknown version"
    tokens = result.stdout.split()
    # Guard against empty output, where indexing [-1] would raise IndexError.
    return tokens[-1] if tokens else "unknown version"
def is_fully_commented(text: str) -> bool:
    """
    Checks if the given text is fully commented out.

    A line counts as commented when its first non-whitespace character is
    '#'. Blank and whitespace-only lines are ignored, so indented comments
    and CRLF line endings do not cause a false negative.

    Args:
        text (str): The input text to check.
    Returns:
        bool: True if all non-empty lines in the text are commented out
        (including when there are no non-empty lines at all), False otherwise.
    """
    for raw_line in text.split('\n'):
        # Strip surrounding whitespace (including a trailing '\r') so that
        # only the line's real content decides the outcome.
        content = raw_line.strip()
        if content and not content.startswith("#"):
            return False
    return True
def clean_code_snippet(response):
    """
    Cleans a code snippet by removing the opening and closing code block delimiters.

    Surrounding whitespace is stripped before the delimiters are looked for,
    so a response that ends with a newline after the closing fence (or starts
    with whitespace before the opening fence) is still unwrapped.

    Args:
        response (str): The code snippet to clean.
    Returns:
        str: The cleaned code snippet without the code block delimiters and
        without leading or trailing whitespace.
    """
    fence = "```"
    python_fence = fence + "python"

    # Normalise first: the fences are only detectable at the very start and
    # very end of the string.
    snippet = response.strip()

    # Check the longer, language-tagged fence first; a bare "```" test would
    # also match it and leave the word "python" in the output.
    if snippet.startswith(python_fence):
        snippet = snippet[len(python_fence):]
    elif snippet.startswith(fence):
        snippet = snippet[len(fence):]

    if snippet.endswith(fence):
        snippet = snippet[:-len(fence)]

    return snippet.strip()
def _load_hf_dataset_and_export_to_jsonl(path: str, split_name: str, output_file: str):
    """
    Export one split of a huggingface dataset to a JSONL file.

    The dataset at ``path`` is loaded with ``load_dataset``, the split named
    ``split_name`` is selected, and each of its items is written to
    ``output_file`` as one JSON object per line. A summary line is printed on
    success; any exception is caught and reported with a printed message
    instead of being propagated.
    """
    try:
        records = load_dataset(path=path)[split_name]
        lines_written = 0
        with open(output_file, "w") as out:
            for record in records:
                # One JSON document per line, newline-terminated.
                json.dump(OrderedDict(record), out)
                out.write("\n")
                lines_written += 1
        print(f"{lines_written} lines converted and saved to {output_file}")
    except Exception as e:
        print(f"An error occurred while loading or exporting the dataset: {e}")
if __name__ == "__main__":
    # Export the "train" split of each dataset to its local JSONL file,
    # in the order listed.
    for hf_path, jsonl_name in (
        ("patched-codes/synth-vuln-fixes", "synth-vuln-fixes-train.jsonl"),
        ("patched-codes/static-analysis-eval", "static-vuln-fixes-eval.jsonl"),
    ):
        _load_hf_dataset_and_export_to_jsonl(hf_path, "train", jsonl_name)