SWE-bench

minisweagent.run.extra.swebench

Run mini-SWE-agent on SWEBench instances.

[not dim] More information about the usage: [bold green]https://mini-swe-agent.com/latest/usage/swebench/[/bold green][/not dim]

app `module-attribute`

app = Typer(rich_markup_mode='rich', add_completion=False)

DATASET_MAPPING `module-attribute`

DATASET_MAPPING = {
    "full": "princeton-nlp/SWE-Bench",
    "verified": "princeton-nlp/SWE-Bench_Verified",
    "lite": "princeton-nlp/SWE-Bench_Lite",
    "multimodal": "princeton-nlp/SWE-Bench_Multimodal",
    "multilingual": "swe-bench/SWE-Bench_Multilingual",
    "smith": "SWE-bench/SWE-smith",
    "_test": "klieret/swe-bench-dummy-test-dataset",
}

ProgressTrackingAgent

ProgressTrackingAgent(
    *args,
    progress_manager: RunBatchProgressManager,
    instance_id: str = "",
    **kwargs,
)

Bases: DefaultAgent

Simple wrapper around DefaultAgent that provides progress updates.

Source code in src/minisweagent/run/extra/swebench.py

def __init__(self, *args, progress_manager: RunBatchProgressManager, instance_id: str = "", **kwargs):
    super().__init__(*args, **kwargs)
    self.progress_manager: RunBatchProgressManager = progress_manager
    self.instance_id = instance_id

progress_manager `instance-attribute`

progress_manager: RunBatchProgressManager = progress_manager

instance_id `instance-attribute`

instance_id = instance_id

step

step() -> dict

Override step to provide progress updates.

Source code in src/minisweagent/run/extra/swebench.py

def step(self) -> dict:
    """Override step to provide progress updates."""
    self.progress_manager.update_instance_status(
        self.instance_id, f"Step {self.model.n_calls + 1:3d} (${self.model.cost:.2f})"
    )
    return super().step()

get_swebench_docker_image_name

get_swebench_docker_image_name(instance: dict) -> str

Get the image name for a SWEBench instance.

Source code in src/minisweagent/run/extra/swebench.py

def get_swebench_docker_image_name(instance: dict) -> str:
    """Get the image name for a SWEBench instance."""
    image_name = instance.get("image_name", None)
    if image_name is None:
        # Docker doesn't allow double underscore, so we replace them with a magic token
        iid = instance["instance_id"]
        id_docker_compatible = iid.replace("__", "_1776_")
        image_name = f"swebench/sweb.eval.x86_64.{id_docker_compatible}:latest".lower()
    return image_name

update_preds_file

update_preds_file(
    output_path: Path,
    instance_id: str,
    model_name: str,
    result: str,
)

Update the output JSON file with results from a single instance.

Source code in src/minisweagent/run/extra/swebench.py

def update_preds_file(output_path: Path, instance_id: str, model_name: str, result: str):
    """Update the output JSON file with results from a single instance."""
    with _OUTPUT_FILE_LOCK:
        output_data = {}
        if output_path.exists():
            output_data = json.loads(output_path.read_text())
        output_data[instance_id] = {
            "model_name_or_path": model_name,
            "instance_id": instance_id,
            "model_patch": result,
        }
        output_path.write_text(json.dumps(output_data, indent=2))

remove_from_preds_file

remove_from_preds_file(output_path: Path, instance_id: str)

Remove an instance from the predictions file.

Source code in src/minisweagent/run/extra/swebench.py

def remove_from_preds_file(output_path: Path, instance_id: str):
    """Remove an instance from the predictions file."""
    if not output_path.exists():
        return
    with _OUTPUT_FILE_LOCK:
        output_data = json.loads(output_path.read_text())
        if instance_id in output_data:
            del output_data[instance_id]
            output_path.write_text(json.dumps(output_data, indent=2))

process_instance

process_instance(
    instance: dict,
    output_dir: Path,
    model_name: str | None,
    config_path: str | Path,
    progress_manager: RunBatchProgressManager,
) -> None

Process a single SWEBench instance.

Source code in src/minisweagent/run/extra/swebench.py

def process_instance(
    instance: dict,
    output_dir: Path,
    model_name: str | None,
    config_path: str | Path,
    progress_manager: RunBatchProgressManager,
) -> None:
    """Process a single SWEBench instance."""
    instance_id = instance["instance_id"]
    instance_dir = output_dir / instance_id
    # avoid inconsistent state if something here fails and there's leftover previous files
    remove_from_preds_file(output_dir / "preds.json", instance_id)
    (instance_dir / f"{instance_id}.traj.json").unlink(missing_ok=True)

    image_name = get_swebench_docker_image_name(instance)
    config = yaml.safe_load(get_config_path(config_path).read_text())
    model = get_model(model_name, config=config.get("model", {}))
    task = instance["problem_statement"]

    progress_manager.on_instance_start(instance_id)
    progress_manager.update_instance_status(instance_id, "Pulling/starting docker")

    agent = None
    extra_info = None

    try:
        env = DockerEnvironment(**(config.get("environment", {}) | {"image": image_name}))
        agent = ProgressTrackingAgent(
            model,
            env,
            progress_manager=progress_manager,
            instance_id=instance_id,
            **config.get("agent", {}),
        )
        exit_status, result = agent.run(task)
    except Exception as e:
        print(f"Error processing instance {instance_id}: {e}\n{traceback.format_exc()}")
        exit_status, result = type(e).__name__, str(e)
        extra_info = {"traceback": traceback.format_exc()}
    finally:
        save_traj(
            agent,
            instance_dir / f"{instance_id}.traj.json",
            exit_status=exit_status,
            result=result,
            extra_info=extra_info,
            instance_id=instance_id,
        )
        update_preds_file(output_dir / "preds.json", instance_id, model.config.model_name, result)
        progress_manager.on_instance_end(instance_id, exit_status)

filter_instances

filter_instances(
    instances: list[dict],
    *,
    filter_spec: str,
    slice_spec: str = "",
    shuffle: bool = False,
) -> list[dict]

Filter and slice a list of SWEBench instances.

Source code in src/minisweagent/run/extra/swebench.py

def filter_instances(
    instances: list[dict], *, filter_spec: str, slice_spec: str = "", shuffle: bool = False
) -> list[dict]:
    """Filter and slice a list of SWEBench instances."""
    if shuffle:
        instances = sorted(instances.copy(), key=lambda x: x["instance_id"])
        random.seed(42)
        random.shuffle(instances)
    before_filter = len(instances)
    instances = [instance for instance in instances if re.match(filter_spec, instance["instance_id"])]
    if (after_filter := len(instances)) != before_filter:
        print(f"Instance filter: {before_filter} -> {after_filter} instances")
    if slice_spec:
        values = [int(x) if x else None for x in slice_spec.split(":")]
        instances = instances[slice(*values)]
        if (after_slice := len(instances)) != before_filter:
            print(f"Instance slice: {before_filter} -> {after_slice} instances")
    return instances

main

main(
    subset: str = Option(
        "lite",
        "--subset",
        help="SWEBench subset to use or path to a dataset",
    ),
    split: str = Option(
        "dev", "--split", help="Dataset split"
    ),
    slice_spec: str = Option(
        "",
        "--slice",
        help="Slice specification (e.g., '0:5' for first 5 instances)",
    ),
    filter_spec: str = Option(
        "", "--filter", help="Filter instance IDs by regex"
    ),
    shuffle: bool = Option(
        False, "--shuffle", help="Shuffle instances"
    ),
    output: str = Option(
        "", "-o", "--output", help="Output directory"
    ),
    workers: int = Option(
        1,
        "-w",
        "--workers",
        help="Number of worker threads for parallel processing",
    ),
    model: str | None = Option(
        None, "-m", "--model", help="Model to use"
    ),
    redo_existing: bool = Option(
        False,
        "--redo-existing",
        help="Redo existing instances",
    ),
    config: Path = Option(
        builtin_config_dir / "extra" / "swebench.yaml",
        "-c",
        "--config",
        help="Path to a config file",
    ),
) -> None

Run mini-SWE-agent on SWEBench instances

Source code in src/minisweagent/run/extra/swebench.py

@app.command()
def main(
    subset: str = typer.Option("lite", "--subset", help="SWEBench subset to use or path to a dataset"),
    split: str = typer.Option("dev", "--split", help="Dataset split"),
    slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g., '0:5' for first 5 instances)"),
    filter_spec: str = typer.Option("", "--filter", help="Filter instance IDs by regex"),
    shuffle: bool = typer.Option(False, "--shuffle", help="Shuffle instances"),
    output: str = typer.Option("", "-o", "--output", help="Output directory"),
    workers: int = typer.Option(1, "-w", "--workers", help="Number of worker threads for parallel processing"),
    model: str | None = typer.Option(None, "-m", "--model", help="Model to use"),
    redo_existing: bool = typer.Option(False, "--redo-existing", help="Redo existing instances"),
    config: Path = typer.Option(
        builtin_config_dir / "extra" / "swebench.yaml", "-c", "--config", help="Path to a config file"
    ),
) -> None:
    """Run mini-SWE-agent on SWEBench instances"""
    dataset_path = DATASET_MAPPING.get(subset, subset)
    print(f"Loading dataset {dataset_path}, split {split}...")
    instances = list(load_dataset(dataset_path, split=split))

    instances = filter_instances(instances, filter_spec=filter_spec, slice_spec=slice_spec, shuffle=shuffle)
    output_path = Path(output)
    if not redo_existing and (output_path / "preds.json").exists():
        existing_instances = list(json.loads((output_path / "preds.json").read_text()).keys())
        print(f"Skipping {len(existing_instances)} existing instances")
        instances = [instance for instance in instances if instance["instance_id"] not in existing_instances]

    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Running on {len(instances)} instances...")
    print(f"Results will be saved to {output_path}")

    progress_manager = RunBatchProgressManager(len(instances), output_path / f"exit_statuses_{time.time()}.yaml")

    def process_futures(futures: dict[concurrent.futures.Future, str]):
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except concurrent.futures.CancelledError:
                pass
            except Exception as e:
                instance_id = futures[future]
                print(f"Error in future for instance {instance_id}: {e}")
                traceback.print_exc()
                progress_manager.on_uncaught_exception(instance_id, e)

    with Live(progress_manager.render_group, refresh_per_second=4):
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            futures = {
                executor.submit(process_instance, instance, output_path, model, config, progress_manager): instance[
                    "instance_id"
                ]
                for instance in instances
            }
            try:
                process_futures(futures)
            except KeyboardInterrupt:
                print("Cancelling all pending jobs. Press ^C again to exit immediately.")
                for future in futures:
                    if not future.running() and not future.done():
                        future.cancel()
                process_futures(futures)