Skip to content

SWE-bench

SWE-bench run script

minisweagent.run.extra.swebench

Run mini-SWE-agent on SWE-bench instances in batch mode.

app module-attribute

app = Typer(rich_markup_mode='rich', add_completion=False)

DATASET_MAPPING module-attribute

# Short aliases accepted for the `subset` argument of `main`, mapped to the dataset
# path that is handed to `load_dataset`. A subset value that is not a key here is
# used as the dataset path verbatim (see the `.get(subset, subset)` lookup in `main`).
DATASET_MAPPING = {
    "full": "princeton-nlp/SWE-Bench",
    "verified": "princeton-nlp/SWE-Bench_Verified",
    "lite": "princeton-nlp/SWE-Bench_Lite",
    "multimodal": "princeton-nlp/SWE-Bench_Multimodal",
    "multilingual": "swe-bench/SWE-Bench_Multilingual",
    "smith": "SWE-bench/SWE-smith",
    "_test": "klieret/swe-bench-dummy-test-dataset",
}

ProgressTrackingAgent

ProgressTrackingAgent(
    *args,
    progress_manager: RunBatchProgressManager,
    instance_id: str = "",
    **kwargs,
)

Bases: DefaultAgent

Simple wrapper around DefaultAgent that provides progress updates.

Source code in src/minisweagent/run/extra/swebench.py
56
57
58
59
def __init__(self, *args, progress_manager: RunBatchProgressManager, instance_id: str = "", **kwargs):
    """Initialize the parent agent and remember where progress is reported.

    All positional arguments and any keyword arguments other than
    ``progress_manager`` and ``instance_id`` are forwarded unchanged to the
    parent initializer.
    """
    super().__init__(*args, **kwargs)
    # Stored for use by `step`, which publishes a status line per step.
    self.instance_id = instance_id
    self.progress_manager: RunBatchProgressManager = progress_manager

progress_manager instance-attribute

progress_manager: RunBatchProgressManager = progress_manager

instance_id instance-attribute

instance_id = instance_id

step

step() -> dict

Override step to provide progress updates.

Source code in src/minisweagent/run/extra/swebench.py
61
62
63
64
65
66
def step(self) -> dict:
    """Publish a status line for this instance, then delegate to the parent step.

    The status shows the model's call count plus one and its accumulated cost.
    """
    model = self.model
    status_line = f"Step {model.n_calls + 1:3d} (${model.cost:.2f})"
    self.progress_manager.update_instance_status(self.instance_id, status_line)
    return super().step()

get_swebench_docker_image_name

get_swebench_docker_image_name(instance: dict) -> str

Get the image name for a SWEBench instance.

Source code in src/minisweagent/run/extra/swebench.py
69
70
71
72
73
74
75
76
77
def get_swebench_docker_image_name(instance: dict) -> str:
    """Return the docker image name to use for a SWEBench instance.

    An ``image_name`` entry that is present and not ``None`` is returned as-is.
    Otherwise the name is derived from ``instance["instance_id"]`` and lowercased.
    """
    explicit_name = instance.get("image_name")
    if explicit_name is not None:
        return explicit_name
    # Docker doesn't allow double underscore, so we replace them with a magic token
    docker_safe_id = instance["instance_id"].replace("__", "_1776_")
    derived_name = f"docker.io/swebench/sweb.eval.x86_64.{docker_safe_id}:latest"
    return derived_name.lower()

get_sb_environment

get_sb_environment(
    config: dict, instance: dict
) -> Environment
Source code in src/minisweagent/run/extra/swebench.py
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def get_sb_environment(config: dict, instance: dict) -> Environment:
    """Create the execution environment for one SWEBench instance.

    Args:
        config: Run configuration. The ``environment`` section supplies the
            environment settings; ``run.env_startup_command`` optionally names
            a Jinja template to execute once the environment is up.
        instance: The SWEBench instance; used to pick the image and to render
            the startup command template.

    Returns:
        The environment produced by ``get_environment``.

    Raises:
        RuntimeError: If the startup command exits with a non-zero return code.
    """
    # Build a private copy of the environment section. ``config`` is shared by
    # every worker thread, so writing the per-instance image into it would let
    # one thread overwrite another's image before ``get_environment`` reads it.
    env_config = dict(config.get("environment") or {})
    env_config.setdefault("environment_class", "docker")
    image_name = get_swebench_docker_image_name(instance)
    if env_config["environment_class"] == "docker":
        env_config["image"] = image_name
    elif env_config["environment_class"] == "singularity":
        env_config["image"] = "docker://" + image_name
    env = get_environment(env_config)
    if startup_command := (config.get("run") or {}).get("env_startup_command"):
        # StrictUndefined makes a template that references a missing instance field fail loudly.
        startup_command = Template(startup_command, undefined=StrictUndefined).render(**instance)
        out = env.execute(startup_command)
        if out["returncode"] != 0:
            raise RuntimeError(f"Error executing startup command: {out}")
    return env

update_preds_file

update_preds_file(
    output_path: Path,
    instance_id: str,
    model_name: str,
    result: str,
)

Update the output JSON file with results from a single instance.

Source code in src/minisweagent/run/extra/swebench.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
def update_preds_file(output_path: Path, instance_id: str, model_name: str, result: str):
    """Store the prediction for one instance in the JSON predictions file.

    Existing file content is loaded, the entry for ``instance_id`` is added or
    replaced, and the file is rewritten. The read-modify-write happens while
    holding ``_OUTPUT_FILE_LOCK``.
    """
    entry = {
        "model_name_or_path": model_name,
        "instance_id": instance_id,
        "model_patch": result,
    }
    with _OUTPUT_FILE_LOCK:
        predictions = json.loads(output_path.read_text()) if output_path.exists() else {}
        predictions[instance_id] = entry
        output_path.write_text(json.dumps(predictions, indent=2))

remove_from_preds_file

remove_from_preds_file(output_path: Path, instance_id: str)

Remove an instance from the predictions file.

Source code in src/minisweagent/run/extra/swebench.py
111
112
113
114
115
116
117
118
119
def remove_from_preds_file(output_path: Path, instance_id: str):
    """Delete the entry for ``instance_id`` from the predictions file.

    Does nothing when the file is missing. The file is only rewritten when the
    entry was actually present.
    """
    if output_path.exists():
        with _OUTPUT_FILE_LOCK:
            predictions = json.loads(output_path.read_text())
            if instance_id not in predictions:
                return
            predictions.pop(instance_id)
            output_path.write_text(json.dumps(predictions, indent=2))

process_instance

process_instance(
    instance: dict,
    output_dir: Path,
    config: dict,
    progress_manager: RunBatchProgressManager,
) -> None

Process a single SWEBench instance.

Source code in src/minisweagent/run/extra/swebench.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def process_instance(
    instance: dict,
    output_dir: Path,
    config: dict,
    progress_manager: RunBatchProgressManager,
) -> None:
    """Run the agent on one SWEBench instance and persist the outcome.

    Leftover results for the instance are cleared first. Whether the agent run
    succeeds or raises, the trajectory is saved, ``preds.json`` is updated, and
    the progress manager is told that the instance has ended.
    """
    instance_id = instance["instance_id"]
    preds_path = output_dir / "preds.json"
    traj_path = output_dir / instance_id / f"{instance_id}.traj.json"
    # avoid inconsistent state if something here fails and there's leftover previous files
    remove_from_preds_file(preds_path, instance_id)
    traj_path.unlink(missing_ok=True)
    model = get_model(config=config.get("model", {}))
    task = instance["problem_statement"]

    progress_manager.on_instance_start(instance_id)
    progress_manager.update_instance_status(instance_id, "Pulling/starting docker")

    # Stay None if the failure happens before the agent exists / when no error occurs.
    agent = None
    extra_info = None

    try:
        env = get_sb_environment(config, instance)
        agent_kwargs = config.get("agent", {})
        agent = ProgressTrackingAgent(
            model,
            env,
            progress_manager=progress_manager,
            instance_id=instance_id,
            **agent_kwargs,
        )
        exit_status, result = agent.run(task)
    except Exception as exc:
        logger.error(f"Error processing instance {instance_id}: {exc}", exc_info=True)
        # Record the failure itself as the outcome: exception class name and message.
        exit_status = type(exc).__name__
        result = str(exc)
        extra_info = {"traceback": traceback.format_exc()}
    finally:
        save_traj(
            agent,
            traj_path,
            exit_status=exit_status,
            result=result,
            extra_info=extra_info,
            instance_id=instance_id,
            print_fct=logger.info,
        )
        update_preds_file(preds_path, instance_id, model.config.model_name, result)
        progress_manager.on_instance_end(instance_id, exit_status)

filter_instances

filter_instances(
    instances: list[dict],
    *,
    filter_spec: str,
    slice_spec: str = "",
    shuffle: bool = False,
) -> list[dict]

Filter and slice a list of SWEBench instances.

Source code in src/minisweagent/run/extra/swebench.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def filter_instances(
    instances: list[dict], *, filter_spec: str, slice_spec: str = "", shuffle: bool = False
) -> list[dict]:
    """Filter and slice a list of SWEBench instances.

    Steps are applied in this order: optional shuffle, regex filter, slice.

    Args:
        instances: Instances, each carrying an ``instance_id`` key. The input
            list is never modified.
        filter_spec: Regular expression applied with ``match`` semantics
            (anchored at the start) to each ``instance_id``. An empty string
            keeps everything.
        slice_spec: Python-style slice such as ``"0:5"`` or ``"::2"``; empty
            parts mean "unbounded". An empty string disables slicing.
        shuffle: If true, sort by ``instance_id`` and shuffle with a fixed seed,
            so the order is reproducible regardless of the input order.

    Returns:
        A new list with the selected instances.
    """
    if shuffle:
        # sorted() already returns a new list, so the caller's list is untouched.
        instances = sorted(instances, key=lambda x: x["instance_id"])
        # A dedicated generator gives the same order as random.seed(42) followed by
        # random.shuffle, without resetting the process-wide RNG state.
        random.Random(42).shuffle(instances)
    before_filter = len(instances)
    pattern = re.compile(filter_spec)
    instances = [instance for instance in instances if pattern.match(instance["instance_id"])]
    if (after_filter := len(instances)) != before_filter:
        logger.info(f"Instance filter: {before_filter} -> {after_filter} instances")
    if slice_spec:
        values = [int(x) if x else None for x in slice_spec.split(":")]
        instances = instances[slice(*values)]
        # Compare against the post-filter count so the message describes the slice alone.
        if (after_slice := len(instances)) != after_filter:
            logger.info(f"Instance slice: {after_filter} -> {after_slice} instances")
    return instances

main

main(
    subset: str = Option(
        "lite",
        "--subset",
        help="SWEBench subset to use or path to a dataset",
        rich_help_panel="Data selection",
    ),
    split: str = Option(
        "dev",
        "--split",
        help="Dataset split",
        rich_help_panel="Data selection",
    ),
    slice_spec: str = Option(
        "",
        "--slice",
        help="Slice specification (e.g., '0:5' for first 5 instances)",
        rich_help_panel="Data selection",
    ),
    filter_spec: str = Option(
        "",
        "--filter",
        help="Filter instance IDs by regex",
        rich_help_panel="Data selection",
    ),
    shuffle: bool = Option(
        False,
        "--shuffle",
        help="Shuffle instances",
        rich_help_panel="Data selection",
    ),
    output: str = Option(
        "",
        "-o",
        "--output",
        help="Output directory",
        rich_help_panel="Basic",
    ),
    workers: int = Option(
        1,
        "-w",
        "--workers",
        help="Number of worker threads for parallel processing",
        rich_help_panel="Basic",
    ),
    model: str | None = Option(
        None,
        "-m",
        "--model",
        help="Model to use",
        rich_help_panel="Basic",
    ),
    model_class: str | None = Option(
        None,
        "-c",
        "--model-class",
        help="Model class to use (e.g., 'anthropic' or 'minisweagent.models.anthropic.AnthropicModel')",
        rich_help_panel="Advanced",
    ),
    redo_existing: bool = Option(
        False,
        "--redo-existing",
        help="Redo existing instances",
        rich_help_panel="Data selection",
    ),
    config_spec: Path = Option(
        builtin_config_dir / "extra" / "swebench.yaml",
        "-c",
        "--config",
        help="Path to a config file",
        rich_help_panel="Basic",
    ),
    environment_class: str | None = Option(
        None,
        "--environment-class",
        help="Environment type to use. Recommended are docker or singularity",
        rich_help_panel="Advanced",
    ),
) -> None
Source code in src/minisweagent/run/extra/swebench.py
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
@app.command(help=_HELP_TEXT)
def main(
    subset: str = typer.Option("lite", "--subset", help="SWEBench subset to use or path to a dataset", rich_help_panel="Data selection"),
    split: str = typer.Option("dev", "--split", help="Dataset split", rich_help_panel="Data selection"),
    slice_spec: str = typer.Option("", "--slice", help="Slice specification (e.g., '0:5' for first 5 instances)", rich_help_panel="Data selection"),
    filter_spec: str = typer.Option("", "--filter", help="Filter instance IDs by regex", rich_help_panel="Data selection"),
    shuffle: bool = typer.Option(False, "--shuffle", help="Shuffle instances", rich_help_panel="Data selection"),
    output: str = typer.Option("", "-o", "--output", help="Output directory", rich_help_panel="Basic"),
    workers: int = typer.Option(1, "-w", "--workers", help="Number of worker threads for parallel processing", rich_help_panel="Basic"),
    model: str | None = typer.Option(None, "-m", "--model", help="Model to use", rich_help_panel="Basic"),
    model_class: str | None = typer.Option(None, "--model-class", help="Model class to use (e.g., 'anthropic' or 'minisweagent.models.anthropic.AnthropicModel')", rich_help_panel="Advanced"),
    redo_existing: bool = typer.Option(False, "--redo-existing", help="Redo existing instances", rich_help_panel="Data selection"),
    config_spec: Path = typer.Option( builtin_config_dir / "extra" / "swebench.yaml", "-c", "--config", help="Path to a config file", rich_help_panel="Basic"),
    environment_class: str | None = typer.Option( None, "--environment-class", help="Environment type to use. Recommended are docker or singularity", rich_help_panel="Advanced"),
) -> None:
    # fmt: on
    # NOTE: the short flag "-c" is declared for "--config" only. It used to be declared for
    # "--model-class" as well, which left a single short flag bound to two different options.
    output_path = Path(output)
    output_path.mkdir(parents=True, exist_ok=True)
    logger.info(f"Results will be saved to {output_path}")
    add_file_handler(output_path / "minisweagent.log")

    # Unknown subset names are treated as a dataset path.
    dataset_path = DATASET_MAPPING.get(subset, subset)
    logger.info(f"Loading dataset {dataset_path}, split {split}...")
    instances = list(load_dataset(dataset_path, split=split))

    instances = filter_instances(instances, filter_spec=filter_spec, slice_spec=slice_spec, shuffle=shuffle)
    if not redo_existing and (output_path / "preds.json").exists():
        # A set keeps the membership test below O(1) per instance.
        existing_instances = set(json.loads((output_path / "preds.json").read_text()))
        logger.info(f"Skipping {len(existing_instances)} existing instances")
        instances = [instance for instance in instances if instance["instance_id"] not in existing_instances]
    logger.info(f"Running on {len(instances)} instances...")

    config_path = get_config_path(config_spec)
    logger.info(f"Loading agent config from '{config_path}'")
    config = yaml.safe_load(config_path.read_text())
    # Command-line options override the corresponding values from the config file.
    if environment_class is not None:
        config.setdefault("environment", {})["environment_class"] = environment_class
    if model is not None:
        config.setdefault("model", {})["model_name"] = model
    if model_class is not None:
        config.setdefault("model", {})["model_class"] = model_class

    progress_manager = RunBatchProgressManager(len(instances), output_path / f"exit_statuses_{time.time()}.yaml")

    def process_futures(futures: dict[concurrent.futures.Future, str]):
        """Wait for all futures; log and report failures instead of propagating them."""
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except concurrent.futures.CancelledError:
                # Cancelled on purpose by the KeyboardInterrupt handler below.
                pass
            except Exception as e:
                instance_id = futures[future]
                logger.error(f"Error in future for instance {instance_id}: {e}", exc_info=True)
                progress_manager.on_uncaught_exception(instance_id, e)

    with Live(progress_manager.render_group, refresh_per_second=4):
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            # Map each future to its instance id so failures can be attributed.
            futures = {
                executor.submit(process_instance, instance, output_path, config, progress_manager): instance[
                    "instance_id"
                ]
                for instance in instances
            }
            try:
                process_futures(futures)
            except KeyboardInterrupt:
                logger.info("Cancelling all pending jobs. Press ^C again to exit immediately.")
                # Only jobs that have not started are cancelled; running ones are awaited below.
                for future in futures:
                    if not future.running() and not future.done():
                        future.cancel()
                process_futures(futures)