SWE-bench

minisweagent.run.extra.swebench

Run mini-SWE-agent on SWEBench instances.

More information about the usage: https://mini-swe-agent.com/latest/usage/swebench/

app module-attribute

app = Typer(rich_markup_mode='rich', add_completion=False)

DATASET_MAPPING module-attribute

DATASET_MAPPING = {
    "full": "princeton-nlp/SWE-Bench",
    "verified": "princeton-nlp/SWE-Bench_Verified",
    "lite": "princeton-nlp/SWE-Bench_Lite",
    "multimodal": "princeton-nlp/SWE-Bench_Multimodal",
    "multilingual": "swe-bench/SWE-Bench_Multilingual",
    "smith": "SWE-bench/SWE-smith",
    "_test": "klieret/swe-bench-dummy-test-dataset",
}

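The main command resolves --subset against this mapping and falls back to treating the value as a dataset path, so both the aliases above and arbitrary Hugging Face dataset IDs or local paths work. A minimal sketch of that lookup (the split name is illustrative and depends on the chosen dataset):

from datasets import load_dataset

# "verified" is an alias from DATASET_MAPPING; unknown values pass through unchanged.
dataset_path = DATASET_MAPPING.get("verified", "verified")
# dataset_path == "princeton-nlp/SWE-Bench_Verified"
instances = list(load_dataset(dataset_path, split="test"))
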
ProgressTrackingAgent

ProgressTrackingAgent(
    *args,
    progress_manager: RunBatchProgressManager,
    instance_id: str = "",
    **kwargs,
)

Bases: DefaultAgent

Simple wrapper around DefaultAgent that provides progress updates.

Source code in src/minisweagent/run/extra/swebench.py
def __init__(self, *args, progress_manager: RunBatchProgressManager, instance_id: str = "", **kwargs):
    super().__init__(*args, **kwargs)
    self.progress_manager: RunBatchProgressManager = progress_manager
    self.instance_id = instance_id

progress_manager instance-attribute

progress_manager: RunBatchProgressManager = progress_manager

instance_id instance-attribute

instance_id = instance_id

step

step() -> dict

Override step to provide progress updates.

Source code in src/minisweagent/run/extra/swebench.py
def step(self) -> dict:
    """Override step to provide progress updates."""
    self.progress_manager.update_instance_status(
        self.instance_id, f"Step {self.model.n_calls + 1:3d} (${self.model.cost:.2f})"
    )
    return super().step()

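The status text combines the model's call count and accumulated cost. A quick illustration of the f-string used above, with placeholder values:

# With n_calls == 2 and cost == 0.147 the status line reads "Step   3 ($0.15)".
f"Step {2 + 1:3d} (${0.147:.2f})"
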
get_swebench_docker_image_name

get_swebench_docker_image_name(instance: dict) -> str

Get the image name for a SWEBench instance.

Source code in src/minisweagent/run/extra/swebench.py
def get_swebench_docker_image_name(instance: dict) -> str:
    """Get the image name for a SWEBench instance."""
    image_name = instance.get("image_name", None)
    if image_name is None:
        # Docker doesn't allow double underscore, so we replace them with a magic token
        iid = instance["instance_id"]
        id_docker_compatible = iid.replace("__", "_1776_")
        image_name = f"swebench/sweb.eval.x86_64.{id_docker_compatible}:latest".lower()
    return image_name

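When the instance carries no explicit image_name, the helper derives one from the instance ID: double underscores (which Docker tags disallow) become the _1776_ token and the result is lowercased. For example:

# Hypothetical instance dict; only "instance_id" is consulted here.
instance = {"instance_id": "astropy__astropy-12907"}
get_swebench_docker_image_name(instance)
# -> "swebench/sweb.eval.x86_64.astropy_1776_astropy-12907:latest"
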
update_preds_file

update_preds_file(
    output_path: Path,
    instance_id: str,
    model_name: str,
    result: str,
)

Update the output JSON file with results from a single instance.

Source code in src/minisweagent/run/extra/swebench.py
def update_preds_file(output_path: Path, instance_id: str, model_name: str, result: str):
    """Update the output JSON file with results from a single instance."""
    with _OUTPUT_FILE_LOCK:
        output_data = {}
        if output_path.exists():
            output_data = json.loads(output_path.read_text())
        output_data[instance_id] = {
            "model_name_or_path": model_name,
            "instance_id": instance_id,
            "model_patch": result,
        }
        output_path.write_text(json.dumps(output_data, indent=2))

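Writes are serialized through the module-level _OUTPUT_FILE_LOCK and the whole file is rewritten on every call, keyed by instance ID. A usage sketch with placeholder values:

from pathlib import Path

# Record one (hypothetical) prediction; preds.json then maps the instance ID to
# {"model_name_or_path": ..., "instance_id": ..., "model_patch": ...}.
update_preds_file(Path("results/preds.json"), "astropy__astropy-12907", "my-model", "diff --git ...")
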
remove_from_preds_file

remove_from_preds_file(output_path: Path, instance_id: str)

Remove an instance from the predictions file.

Source code in src/minisweagent/run/extra/swebench.py
def remove_from_preds_file(output_path: Path, instance_id: str):
    """Remove an instance from the predictions file."""
    if not output_path.exists():
        return
    with _OUTPUT_FILE_LOCK:
        output_data = json.loads(output_path.read_text())
        if instance_id in output_data:
            del output_data[instance_id]
            output_path.write_text(json.dumps(output_data, indent=2))

process_instance

process_instance(
    instance: dict,
    output_dir: Path,
    model_name: str | None,
    config_path: str | Path,
    progress_manager: RunBatchProgressManager,
) -> None

Process a single SWEBench instance.

Source code in src/minisweagent/run/extra/swebench.py
def process_instance(
    instance: dict,
    output_dir: Path,
    model_name: str | None,
    config_path: str | Path,
    progress_manager: RunBatchProgressManager,
) -> None:
    """Process a single SWEBench instance."""
    instance_id = instance["instance_id"]
    instance_dir = output_dir / instance_id
    # avoid inconsistent state if something here fails and there's leftover previous files
    remove_from_preds_file(output_dir / "preds.json", instance_id)
    (instance_dir / f"{instance_id}.traj.json").unlink(missing_ok=True)

    image_name = get_swebench_docker_image_name(instance)
    config = yaml.safe_load(get_config_path(config_path).read_text())
    model = get_model(model_name, config=config.get("model", {}))
    task = instance["problem_statement"]

    progress_manager.on_instance_start(instance_id)
    progress_manager.update_instance_status(instance_id, "Pulling/starting docker")

    agent = None
    extra_info = None

    try:
        env = DockerEnvironment(**(config.get("environment", {}) | {"image": image_name}))
        agent = ProgressTrackingAgent(
            model,
            env,
            progress_manager=progress_manager,
            instance_id=instance_id,
            **config.get("agent", {}),
        )
        exit_status, result = agent.run(task)
    except Exception as e:
        print(f"Error processing instance {instance_id}: {e}\n{traceback.format_exc()}")
        exit_status, result = type(e).__name__, str(e)
        extra_info = {"traceback": traceback.format_exc()}
    finally:
        save_traj(
            agent,
            instance_dir / f"{instance_id}.traj.json",
            exit_status=exit_status,
            result=result,
            extra_info=extra_info,
            instance_id=instance_id,
        )
        update_preds_file(output_dir / "preds.json", instance_id, model.config.model_name, result)
        progress_manager.on_instance_end(instance_id, exit_status)

filter_instances

filter_instances(
    instances: list[dict],
    *,
    filter_spec: str,
    slice_spec: str = "",
    shuffle: bool = False,
) -> list[dict]

Filter and slice a list of SWEBench instances.

Source code in src/minisweagent/run/extra/swebench.py
def filter_instances(
    instances: list[dict], *, filter_spec: str, slice_spec: str = "", shuffle: bool = False
) -> list[dict]:
    """Filter and slice a list of SWEBench instances."""
    if shuffle:
        instances = sorted(instances.copy(), key=lambda x: x["instance_id"])
        random.seed(42)
        random.shuffle(instances)
    before_filter = len(instances)
    instances = [instance for instance in instances if re.match(filter_spec, instance["instance_id"])]
    if (after_filter := len(instances)) != before_filter:
        print(f"Instance filter: {before_filter} -> {after_filter} instances")
    if slice_spec:
        values = [int(x) if x else None for x in slice_spec.split(":")]
        instances = instances[slice(*values)]
        if (after_slice := len(instances)) != before_filter:
            print(f"Instance slice: {before_filter} -> {after_slice} instances")
    return instances

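A usage sketch: filter_spec is applied with re.match, so the pattern is anchored at the start of the instance ID; slice_spec uses Python slice syntax; shuffling is deterministic (seed 42) after an initial sort by instance ID and happens before filtering and slicing:

# Shuffle deterministically, keep IDs starting with "django__", then take the first ten.
selected = filter_instances(
    instances,
    filter_spec="django__",
    slice_spec=":10",
    shuffle=True,
)
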
main

main(
    subset: str = Option(
        "lite",
        "--subset",
        help="SWEBench subset to use or path to a dataset",
    ),
    split: str = Option(
        "dev", "--split", help="Dataset split"
    ),
    slice_spec: str = Option(
        "",
        "--slice",
        help="Slice specification (e.g., '0:5' for first 5 instances)",
    ),
    filter_spec: str = Option(
        "", "--filter", help="Filter instance IDs by regex"
    ),
    shuffle: bool = Option(
        False, "--shuffle", help="Shuffle instances"
    ),
    output: str = Option(
        "", "-o", "--output", help="Output directory"
    ),
    workers: int = Option(
        1,
        "-w",
        "--workers",
        help="Number of worker threads for parallel processing",
    ),
    model: str | None = Option(
        None, "-m", "--model", help="Model to use"
    ),
    redo_existing: bool = Option(
        False,
        "--redo-existing",
        help="Redo existing instances",
    ),
    config: Path = Option(
        builtin_config_dir / "extra" / "swebench.yaml",
        "-c",
        "--config",
        help="Path to a config file",
    ),
) -> None

Run mini-SWE-agent on SWEBench instances

Source code in src/minisweagent/run/extra/swebench.py
@app.command()
def main(
    subset: str = typer.Option("lite", "--subset", help="SWEBench subset to use or path to a dataset"),
    split: str = typer.Option("dev", "--split", help="Dataset split"),
    slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g., '0:5' for first 5 instances)"),
    filter_spec: str = typer.Option("", "--filter", help="Filter instance IDs by regex"),
    shuffle: bool = typer.Option(False, "--shuffle", help="Shuffle instances"),
    output: str = typer.Option("", "-o", "--output", help="Output directory"),
    workers: int = typer.Option(1, "-w", "--workers", help="Number of worker threads for parallel processing"),
    model: str | None = typer.Option(None, "-m", "--model", help="Model to use"),
    redo_existing: bool = typer.Option(False, "--redo-existing", help="Redo existing instances"),
    config: Path = typer.Option(
        builtin_config_dir / "extra" / "swebench.yaml", "-c", "--config", help="Path to a config file"
    ),
) -> None:
    """Run mini-SWE-agent on SWEBench instances"""
    dataset_path = DATASET_MAPPING.get(subset, subset)
    print(f"Loading dataset {dataset_path}, split {split}...")
    instances = list(load_dataset(dataset_path, split=split))

    instances = filter_instances(instances, filter_spec=filter_spec, slice_spec=slice_spec, shuffle=shuffle)
    output_path = Path(output)
    if not redo_existing and (output_path / "preds.json").exists():
        existing_instances = list(json.loads((output_path / "preds.json").read_text()).keys())
        print(f"Skipping {len(existing_instances)} existing instances")
        instances = [instance for instance in instances if instance["instance_id"] not in existing_instances]

    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Running on {len(instances)} instances...")
    print(f"Results will be saved to {output_path}")

    progress_manager = RunBatchProgressManager(len(instances), output_path / f"exit_statuses_{time.time()}.yaml")

    def process_futures(futures: dict[concurrent.futures.Future, str]):
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except concurrent.futures.CancelledError:
                pass
            except Exception as e:
                instance_id = futures[future]
                print(f"Error in future for instance {instance_id}: {e}")
                traceback.print_exc()
                progress_manager.on_uncaught_exception(instance_id, e)

    with Live(progress_manager.render_group, refresh_per_second=4):
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            futures = {
                executor.submit(process_instance, instance, output_path, model, config, progress_manager): instance[
                    "instance_id"
                ]
                for instance in instances
            }
            try:
                process_futures(futures)
            except KeyboardInterrupt:
                print("Cancelling all pending jobs. Press ^C again to exit immediately.")
                for future in futures:
                    if not future.running() and not future.done():
                        future.cancel()
                process_futures(futures)
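
A typical invocation, assuming the module is run directly with python -m (the installed command-line entry point may differ; see the usage page linked at the top):

# Run the first five instances of the lite subset's dev split with four workers,
# writing preds.json and per-instance trajectories to ./results.
# The model name is a placeholder.
python -m minisweagent.run.extra.swebench \
    --subset lite --split dev --slice 0:5 \
    -o results -w 4 -m <model-name>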