Zum Inhalt springen

Referenz für ultralytics/data/explorer/explorer.py

Hinweis

Diese Datei ist verfügbar unter https://github.com/ultralytics/ultralytics/blob/main/ultralytics/data/explorer/explorer.py. Wenn du ein Problem entdeckst, hilf bitte, es zu beheben, indem du einen Pull Request 🛠️ einreichst. Vielen Dank 🙏!



ultralytics.data.explorer.explorer.ExplorerDataset

Basen: YOLODataset

Quellcode in ultralytics/data/explorer/explorer.py
class ExplorerDataset(YOLODataset):
    """YOLODataset subclass whose image loading and transforms skip resizing."""

    def __init__(self, *args, data: dict = None, **kwargs) -> None:
        """Forward every argument unchanged to the YOLODataset constructor."""
        super().__init__(*args, data=data, **kwargs)

    def load_image(self, i: int) -> Union[Tuple[np.ndarray, Tuple[int, int], Tuple[int, int]], Tuple[None, None, None]]:
        """
        Return image `i` at its stored size together with two (height, width) entries.

        A RAM-cached image is returned with its recorded `im_hw0` / `im_hw` values. Otherwise the image is read
        from its `.npy` file when that exists, else from the image file, and its own shape is reported twice.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        cached, img_path, npy_path = self.ims[i], self.im_files[i], self.npy_files[i]
        if cached is not None:  # already held in RAM
            return self.ims[i], self.im_hw0[i], self.im_hw[i]

        if npy_path.exists():
            image = np.load(npy_path)
        else:
            image = cv2.imread(img_path)  # BGR
            if image is None:
                raise FileNotFoundError(f"Image Not Found {img_path}")
        height, width = image.shape[:2]  # no resize, so this is the original size
        return image, (height, width), image.shape[:2]

    def build_transforms(self, hyp: IterableSimpleNamespace = None):
        """
        Build the Format transform used for dataset images (xyxy boxes, not normalized).

        Args:
            hyp (IterableSimpleNamespace): Must expose `mask_ratio` and `overlap_mask`; both are read directly,
                so the `None` default cannot be used as-is.
        """
        format_options = dict(
            bbox_format="xyxy",
            normalize=False,
            return_mask=self.use_segments,
            return_keypoint=self.use_keypoints,
            batch_idx=True,
            mask_ratio=hyp.mask_ratio,
            mask_overlap=hyp.overlap_mask,
        )
        return Format(**format_options)

build_transforms(hyp=None)

Erzeugt Transformationen für Datensatzbilder ohne Größenänderung.

Quellcode in ultralytics/data/explorer/explorer.py
def build_transforms(self, hyp: IterableSimpleNamespace = None):
    """
    Build the Format transform used for dataset images (xyxy boxes, not normalized).

    Args:
        hyp (IterableSimpleNamespace): Must expose `mask_ratio` and `overlap_mask`; both are read directly,
            so the `None` default cannot be used as-is.
    """
    format_options = dict(
        bbox_format="xyxy",
        normalize=False,
        return_mask=self.use_segments,
        return_keypoint=self.use_keypoints,
        batch_idx=True,
        mask_ratio=hyp.mask_ratio,
        mask_overlap=hyp.overlap_mask,
    )
    return Format(**format_options)

load_image(i)

Lädt 1 Bild aus dem Datensatzindex 'i' ohne Größenänderung.

Quellcode in ultralytics/data/explorer/explorer.py
def load_image(self, i: int) -> Union[Tuple[np.ndarray, Tuple[int, int], Tuple[int, int]], Tuple[None, None, None]]:
    """
    Return image `i` at its stored size together with two (height, width) entries.

    A RAM-cached image is returned with its recorded `im_hw0` / `im_hw` values. Otherwise the image is read
    from its `.npy` file when that exists, else from the image file, and its own shape is reported twice.

    Raises:
        FileNotFoundError: If the image file cannot be read.
    """
    cached, img_path, npy_path = self.ims[i], self.im_files[i], self.npy_files[i]
    if cached is not None:  # already held in RAM
        return self.ims[i], self.im_hw0[i], self.im_hw[i]

    if npy_path.exists():
        image = np.load(npy_path)
    else:
        image = cv2.imread(img_path)  # BGR
        if image is None:
            raise FileNotFoundError(f"Image Not Found {img_path}")
    height, width = image.shape[:2]  # no resize, so this is the original size
    return image, (height, width), image.shape[:2]



ultralytics.data.explorer.explorer.Explorer

Quellcode in ultralytics/data/explorer/explorer.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
class Explorer:
    def __init__(
        self,
        data: Union[str, Path] = "coco128.yaml",
        model: str = "yolov8n.pt",
        uri: Union[str, Path] = USER_CONFIG_DIR / "explorer",
    ) -> None:
        """
        Connect to the LanceDB store and load the embedding model.

        Args:
            data (str | Path): Dataset YAML; its lowercased file name is the first part of the table name.
            model (str): Weights passed to `YOLO`; the lowercased string is the second part of the table name.
            uri (str | Path): Location handed to `lancedb.connect`.
        """
        # Note duckdb==0.10.0 bug https://github.com/ultralytics/ultralytics/pull/8181
        checks.check_requirements(["lancedb>=0.4.3", "duckdb<=0.9.2"])
        import lancedb

        self.connection = lancedb.connect(uri)
        self.table_name = Path(data).name.lower() + "_" + model.lower()
        # Use this name and append thres and top_k to reuse the table
        self.sim_idx_base_name = f"{self.table_name}_sim_idx".lower()
        self.model = YOLO(model)
        self.data = data
        self.choice_set = None  # assigned by create_embeddings_table()

        self.table = None  # embeddings table, assigned by create_embeddings_table()
        # Declared here so the attribute exists before similarity_index() has been called.
        self.sim_index = None
        self.progress = 0  # fraction of images embedded so far (0 to 1)

    def create_embeddings_table(self, force: bool = False, split: str = "train") -> None:
        """
        Build (or reopen) the LanceDB table holding one embedding per dataset image.

        A table already attached to this instance, or one stored under `self.table_name`, is reused unless
        `force=True`; otherwise the table is created in overwrite mode and filled from the chosen split.

        Args:
            force (bool): Rebuild the table even if one already exists. Defaults to False.
            split (str): Key of the dataset split to embed. Defaults to 'train'.

        Raises:
            ValueError: If `self.data` is None or `split` is not a key of the checked dataset.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            ```
        """
        reuse_allowed = not force
        if reuse_allowed and self.table is not None:
            LOGGER.info("Table already exists. Reusing it. Pass force=True to overwrite it.")
            return
        if self.table_name in self.connection.table_names() and reuse_allowed:
            LOGGER.info(f"Table {self.table_name} already exists. Reusing it. Pass force=True to overwrite it.")
            self.table = self.connection.open_table(self.table_name)
            self.progress = 1  # nothing left to embed
            return
        if self.data is None:
            raise ValueError("Data must be provided to create embeddings table")

        data_info = check_det_dataset(self.data)
        if split not in data_info:
            available = list(data_info.keys())
            raise ValueError(
                f"Split {split} is not found in the dataset. Available keys in the dataset are {available}"
            )

        split_paths = data_info[split]
        if not isinstance(split_paths, list):
            split_paths = [split_paths]
        self.choice_set = split_paths
        dataset = ExplorerDataset(
            img_path=split_paths, data=data_info, augment=False, cache=False, task=self.model.task
        )

        # Embed the first image once to learn the vector length the schema needs.
        first_sample = dataset[0]
        probe = self.model.embed(first_sample["im_file"], verbose=False)
        vector_size = probe[0].shape[0]
        schema = get_table_schema(vector_size)
        table = self.connection.create_table(self.table_name, schema=schema, mode="overwrite")
        rows = self._yield_batches(
            dataset,
            data_info,
            self.model,
            exclude_keys=["img", "ratio_pad", "resized_shape", "ori_shape", "batch_idx"],
        )
        table.add(rows)

        self.table = table

    def _yield_batches(self, dataset: ExplorerDataset, data_info: dict, model: YOLO, exclude_keys: List[str]):
        """
        Yield one single-row batch per dataset item, with its embedding stored under "vector".

        Keys listed in `exclude_keys` are dropped from each item, the item is passed through `sanitize_batch`,
        and `self.progress` is updated to the fraction of items reached so far.
        """
        total = len(dataset)
        for index in tqdm(range(total)):
            self.progress = float(index + 1) / total
            row = dataset[index]
            for key in exclude_keys:
                row.pop(key, None)
            row = sanitize_batch(row, data_info)
            embedding = model.embed(row["im_file"], verbose=False)[0]
            row["vector"] = embedding.detach().tolist()
            yield [row]

    def query(
        self, imgs: Union[str, np.ndarray, List[str], List[np.ndarray]] = None, limit: int = 25
    ) -> Any:  # pyarrow.Table
        """
        Query the table for similar images. Accepts a single image or a list of images.

        When several images are given, their embeddings are averaged and the mean vector is searched.

        Args:
            imgs (str | np.ndarray | list): A single image (path or array) or a list of images.
            limit (int): Number of results to return.

        Returns:
            (pyarrow.Table): An arrow table containing the results. Supports converting to:
                - pandas dataframe: `result.to_pandas()`
                - dict of lists: `result.to_pydict()`

        Raises:
            ValueError: If the embeddings table has not been created yet.
            AssertionError: If `imgs` is neither a single image nor a list.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            similar = exp.query(imgs='https://ultralytics.com/images/zidane.jpg')
            ```
        """
        if self.table is None:
            raise ValueError("Table is not created. Please create the table first.")
        # A lone image, whether a path or an array, is treated as a one-element list. The signature
        # advertises np.ndarray, which previously tripped the assert below.
        if isinstance(imgs, (str, np.ndarray)):
            imgs = [imgs]
        assert isinstance(imgs, list), f"img must be a string or a list of strings. Got {type(imgs)}"
        embeds = self.model.embed(imgs)
        # Get avg if multiple images are passed (len > 1)
        if len(embeds) > 1:
            vector = torch.mean(torch.stack(embeds), 0).cpu().numpy()
        else:
            vector = embeds[0].cpu().numpy()
        return self.table.search(vector).limit(limit).to_arrow()

    def sql_query(
        self, query: str, return_type: str = "pandas"
    ) -> Union[Any, None]:  # pandas.DataFrame or pyarrow.Table
        """
        Run a SQL-like query on the table through DuckDB.

        The query may be a full `SELECT ...` statement or only a `WHERE ...` clause, in which case it is
        expanded to `SELECT * FROM 'table' WHERE ...`. Leading whitespace and keyword case are ignored when
        checking the prefix.

        Args:
            query (str): SQL query to run.
            return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.

        Returns:
            (pandas.DataFrame | pyarrow.Table): The query result in the form selected by `return_type`.

        Raises:
            ValueError: If the table has not been created, or the query starts with neither SELECT nor WHERE.
            AssertionError: If `return_type` is not 'pandas' or 'arrow'.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
            result = exp.sql_query(query)
            ```
        """
        assert return_type in {
            "pandas",
            "arrow",
        }, f"Return type should be either `pandas` or `arrow`, but got {return_type}"
        import duckdb

        if self.table is None:
            raise ValueError("Table is not created. Please create the table first.")

        # Note: using filter pushdown would be a better long term solution. Temporarily using duckdb for this.
        table = self.table.to_arrow()  # noqa NOTE: Don't comment this. This line is used by DuckDB

        # SQL keywords are case-insensitive and leading whitespace is harmless, so normalise both
        # before checking the prefix instead of rejecting e.g. "select ..." or " WHERE ...".
        query = query.lstrip()
        keyword = query[:6].upper()
        if not keyword.startswith(("SELECT", "WHERE")):
            raise ValueError(
                f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE "
                f"clause. found {query}"
            )
        if keyword.startswith("WHERE"):
            query = f"SELECT * FROM 'table' {query}"
        LOGGER.info(f"Running query: {query}")

        # Must be called from this frame: the query refers to the local variable `table` by name.
        rs = duckdb.sql(query)
        if return_type == "arrow":
            return rs.arrow()
        elif return_type == "pandas":
            return rs.df()
    def plot_sql_query(self, query: str, labels: bool = True) -> Image.Image:
        """
        Run a SQL-like query on the table and plot the matching rows.

        Args:
            query (str): SQL query to run.
            labels (bool): Whether to plot the labels or not.

        Returns:
            (PIL.Image): Image containing the plot, or None when the query matches nothing.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
            result = exp.plot_sql_query(query)
            ```
        """
        rows = self.sql_query(query, return_type="arrow")
        if len(rows) != 0:
            plotted = plot_query_result(rows, plot_labels=labels)
            return Image.fromarray(plotted)
        LOGGER.info("No results found.")
        return None

    def get_similar(
        self,
        img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
        idx: Union[int, List[int]] = None,
        limit: int = 25,
        return_type: str = "pandas",
    ) -> Any:  # pandas.DataFrame or pyarrow.Table
        """
        Find the images most similar to the given image(s) or table row(s).

        Exactly one of `img` and `idx` must be supplied; the choice is resolved by `_check_imgs_or_idxs`
        and the result is handed to `query`.

        Args:
            img (str or list): Path to the image or a list of paths to the images.
            idx (int or list): Index of the image in the table or a list of indexes.
            limit (int): Number of results to return. Defaults to 25.
            return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.

        Returns:
            (pandas.DataFrame | pyarrow.Table): The results, in the form selected by `return_type`.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
            ```
        """
        allowed_types = {"pandas", "arrow"}
        assert return_type in allowed_types, f"Return type should be either `pandas` or `arrow`, but got {return_type}"
        sources = self._check_imgs_or_idxs(img, idx)
        matches = self.query(sources, limit=limit)

        if return_type == "pandas":
            return matches.to_pandas()
        if return_type == "arrow":
            return matches

    def plot_similar(
        self,
        img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
        idx: Union[int, List[int]] = None,
        limit: int = 25,
        labels: bool = True,
    ) -> Image.Image:
        """
        Plot the images most similar to the given image(s) or table row(s).

        Args:
            img (str or list): Path to the image or a list of paths to the images.
            idx (int or list): Index of the image in the table or a list of indexes.
            limit (int): Number of results to return. Defaults to 25.
            labels (bool): Whether to plot the labels or not.

        Returns:
            (PIL.Image): Image containing the plot, or None when nothing similar is found.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
            ```
        """
        matches = self.get_similar(img, idx, limit, return_type="arrow")
        if len(matches) != 0:
            plotted = plot_query_result(matches, plot_labels=labels)
            return Image.fromarray(plotted)
        LOGGER.info("No results found.")
        return None

    def similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Any:  # pd.DataFrame
        """
        Calculate the similarity index of all the images in the table. Here, the index will contain the data points that
        are max_dist or closer to the image in the embedding space at a given index.

        The result is stored in its own table, named from `max_dist` and `top_k`, and that table is reused on
        later calls unless `force=True`. The table handle is kept on `self.sim_index` in both cases.

        Args:
            max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
            top_k (float): Fraction (0.0 to 1.0) of the table to consider when counting; used as the limit of
                each vector search. None (the default) or 0 searches the whole table.
            force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.

        Returns:
            (pandas.DataFrame): A dataframe containing the similarity index. Each row corresponds to an image and
                holds its index, file, the count of similar images, and the files of those similar images.

        Raises:
            ValueError: If the embeddings table is missing, `top_k` is outside [0, 1], or `max_dist` is negative.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            sim_idx = exp.similarity_index()
            ```
        """
        if self.table is None:
            raise ValueError("Table is not created. Please create the table first.")
        sim_idx_table_name = f"{self.sim_idx_base_name}_thres_{max_dist}_top_{top_k}".lower()
        if sim_idx_table_name in self.connection.table_names() and not force:
            LOGGER.info("Similarity matrix already exists. Reusing it. Pass force=True to overwrite it.")
            # Keep the handle on the instance here too, so self.sim_index is set on every successful call.
            self.sim_index = self.connection.open_table(sim_idx_table_name)
            return self.sim_index.to_pandas()

        if top_k and not (1.0 >= top_k >= 0.0):
            raise ValueError(f"top_k must be between 0.0 and 1.0. Got {top_k}")
        if max_dist < 0.0:
            raise ValueError(f"max_dist must be greater than 0. Got {max_dist}")

        # Turn the fraction into a row count; always search for at least one neighbour.
        top_k = int(top_k * len(self.table)) if top_k else len(self.table)
        top_k = max(top_k, 1)
        features = self.table.to_lance().to_table(columns=["vector", "im_file"]).to_pydict()
        im_files = features["im_file"]
        embeddings = features["vector"]

        sim_table = self.connection.create_table(sim_idx_table_name, schema=get_sim_index_schema(), mode="overwrite")

        def _yield_sim_idx():
            """Yield one single-row batch per image with its near neighbours within max_dist."""
            for i in tqdm(range(len(embeddings))):
                sim_idx = self.table.search(embeddings[i]).limit(top_k).to_pandas().query(f"_distance <= {max_dist}")
                yield [
                    {
                        "idx": i,
                        "im_file": im_files[i],
                        "count": len(sim_idx),
                        "sim_im_files": sim_idx["im_file"].tolist(),
                    }
                ]

        sim_table.add(_yield_sim_idx())
        self.sim_index = sim_table
        return sim_table.to_pandas()

    def plot_similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Image.Image:
        """
        Plot the similarity index of all the images in the table. Here, the index will contain the data points that are
        max_dist or closer to the image in the embedding space at a given index.

        Args:
            max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
            top_k (float): Fraction of closest data points to consider when counting. Used to apply limit when
                running vector search. Defaults to None.
            force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.

        Returns:
            (PIL.Image): Image containing the bar plot of similar-image counts per data index.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()

            similarity_idx_plot = exp.plot_similarity_index()
            similarity_idx_plot.show() # view image preview
            similarity_idx_plot.save('path/to/save/similarity_index_plot.png') # save contents to file
            ```
        """
        sim_idx = self.similarity_index(max_dist=max_dist, top_k=top_k, force=force)
        sim_count = np.array(sim_idx["count"].tolist())
        indices = np.arange(len(sim_count))

        # Draw on a dedicated figure and close it afterwards. Using the implicit global pyplot figure
        # makes repeated calls stack their bars on top of each other and leaves figures open.
        buffer = BytesIO()
        fig, ax = plt.subplots()
        try:
            ax.bar(indices, sim_count)
            ax.set_xlabel("data idx")
            ax.set_ylabel("Count")
            ax.set_title("Similarity Count")
            fig.savefig(buffer, format="png")
        finally:
            plt.close(fig)
        buffer.seek(0)

        # Use Pillow to open the image from the buffer
        return Image.fromarray(np.array(Image.open(buffer)))

    def _check_imgs_or_idxs(
        self, img: Union[str, np.ndarray, List[str], List[np.ndarray], None], idx: Union[None, int, List[int]]
    ) -> List[np.ndarray]:
        """
        Resolve an image-or-index argument pair into a list of images.

        Exactly one of `img` and `idx` must be given. Indexes are looked up in the table and replaced by the
        corresponding `im_file` values; a single image is wrapped in a list.

        Raises:
            ValueError: If both or neither of `img` and `idx` are provided.
        """
        has_img = img is not None
        has_idx = idx is not None
        if not (has_img or has_idx):
            raise ValueError("Either img or idx must be provided.")
        if has_img and has_idx:
            raise ValueError("Only one of img or idx must be provided.")

        if has_idx:
            rows = idx if isinstance(idx, list) else [idx]
            img = self.table.to_lance().take(rows, columns=["im_file"]).to_pydict()["im_file"]

        if isinstance(img, list):
            return img
        return [img]

    def ask_ai(self, query):
        """
        Turn a natural-language question into SQL via `prompt_sql_query` and run it on the table.

        Args:
            query (str): Question to ask.

        Returns:
            (pandas.DataFrame): A dataframe containing filtered results to the SQL query, or None when running
                the generated query fails (the failure is logged).

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            answer = exp.ask_ai('Show images with 1 person and 2 dogs')
            ```
        """
        generated_sql = prompt_sql_query(query)
        try:
            answer = self.sql_query(generated_sql)
        except Exception as err:
            LOGGER.error("AI generated query is not valid. Please try again with a different prompt")
            LOGGER.error(err)
            return None
        return answer

    def visualize(self, result):
        """
        Placeholder for visualizing the results of a query; not implemented yet, does nothing. TODO.

        Args:
            result (pyarrow.Table): Table containing the results of a query.
        """
        return None

    def generate_report(self, result):
        """
        Placeholder for generating a report of the dataset; not implemented yet, does nothing.

        TODO
        """
        return None

ask_ai(query)

Stelle der KI eine Frage.

Parameter:

Name Typ Beschreibung Standard
query str

Die zu stellende Frage.

erforderlich

Rückgabe:

Typ Beschreibung
DataFrame

Ein Datenrahmen mit gefilterten Ergebnissen der SQL-Abfrage.

Beispiel
exp = Explorer()
exp.create_embeddings_table()
answer = exp.ask_ai('Show images with 1 person and 2 dogs')
Quellcode in ultralytics/data/explorer/explorer.py
def ask_ai(self, query):
    """
    Turn a natural-language question into SQL via `prompt_sql_query` and run it on the table.

    Args:
        query (str): Question to ask.

    Returns:
        (pandas.DataFrame): A dataframe containing filtered results to the SQL query, or None when running
            the generated query fails (the failure is logged).

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        answer = exp.ask_ai('Show images with 1 person and 2 dogs')
        ```
    """
    generated_sql = prompt_sql_query(query)
    try:
        answer = self.sql_query(generated_sql)
    except Exception as err:
        LOGGER.error("AI generated query is not valid. Please try again with a different prompt")
        LOGGER.error(err)
        return None
    return answer

create_embeddings_table(force=False, split='train')

Erstelle eine LanceDB-Tabelle, die die Einbettungen der Bilder im Datensatz enthält. Die Tabelle wird wiederverwendet, wenn sie bereits existiert. Gib force=True an, um die bestehende Tabelle zu überschreiben.

Parameter:

Name Typ Beschreibung Standard
force bool

Ob die bestehende Tabelle überschrieben werden soll oder nicht. Der Standardwert ist False.

False
split str

Aufteilung des zu verwendenden Datensatzes. Der Standardwert ist "train".

'train'
Beispiel
exp = Explorer()
exp.create_embeddings_table()
Quellcode in ultralytics/data/explorer/explorer.py
def create_embeddings_table(self, force: bool = False, split: str = "train") -> None:
    """
    Build (or reopen) the LanceDB table holding one embedding per dataset image.

    A table already attached to this instance, or one stored under `self.table_name`, is reused unless
    `force=True`; otherwise the table is created in overwrite mode and filled from the chosen split.

    Args:
        force (bool): Rebuild the table even if one already exists. Defaults to False.
        split (str): Key of the dataset split to embed. Defaults to 'train'.

    Raises:
        ValueError: If `self.data` is None or `split` is not a key of the checked dataset.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        ```
    """
    reuse_allowed = not force
    if reuse_allowed and self.table is not None:
        LOGGER.info("Table already exists. Reusing it. Pass force=True to overwrite it.")
        return
    if self.table_name in self.connection.table_names() and reuse_allowed:
        LOGGER.info(f"Table {self.table_name} already exists. Reusing it. Pass force=True to overwrite it.")
        self.table = self.connection.open_table(self.table_name)
        self.progress = 1  # nothing left to embed
        return
    if self.data is None:
        raise ValueError("Data must be provided to create embeddings table")

    data_info = check_det_dataset(self.data)
    if split not in data_info:
        available = list(data_info.keys())
        raise ValueError(
            f"Split {split} is not found in the dataset. Available keys in the dataset are {available}"
        )

    split_paths = data_info[split]
    if not isinstance(split_paths, list):
        split_paths = [split_paths]
    self.choice_set = split_paths
    dataset = ExplorerDataset(
        img_path=split_paths, data=data_info, augment=False, cache=False, task=self.model.task
    )

    # Embed the first image once to learn the vector length the schema needs.
    first_sample = dataset[0]
    probe = self.model.embed(first_sample["im_file"], verbose=False)
    vector_size = probe[0].shape[0]
    schema = get_table_schema(vector_size)
    table = self.connection.create_table(self.table_name, schema=schema, mode="overwrite")
    rows = self._yield_batches(
        dataset,
        data_info,
        self.model,
        exclude_keys=["img", "ratio_pad", "resized_shape", "ori_shape", "batch_idx"],
    )
    table.add(rows)

    self.table = table

generate_report(result)

Erstelle einen Bericht über den Datensatz.

TODO

Quellcode in ultralytics/data/explorer/explorer.py
def generate_report(self, result):
    """
    Placeholder for generating a report of the dataset; not implemented yet, does nothing.

    TODO
    """
    return None

get_similar(img=None, idx=None, limit=25, return_type='pandas')

Frag die Tabelle nach ähnlichen Bildern ab. Akzeptiert ein einzelnes Bild oder eine Liste von Bildern.

Parameter:

Name Typ Beschreibung Standard
img str or list

Pfad zum Bild oder eine Liste von Pfaden zu den Bildern.

None
idx int or list

Index des Bildes in der Tabelle oder eine Liste von Indizes.

None
limit int

Anzahl der Ergebnisse, die zurückgegeben werden. Der Standardwert ist 25.

25
return_type str

Typ des Ergebnisses, das zurückgegeben werden soll. Kann entweder "pandas" oder "arrow" sein. Der Standardwert ist "pandas".

'pandas'

Rückgabe:

Typ Beschreibung
DataFrame

Ein Datenrahmen, der die Ergebnisse enthält.

Beispiel
exp = Explorer()
exp.create_embeddings_table()
similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
Quellcode in ultralytics/data/explorer/explorer.py
def get_similar(
    self,
    img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
    idx: Union[int, List[int]] = None,
    limit: int = 25,
    return_type: str = "pandas",
) -> Any:  # pandas.DataFrame or pyarrow.Table
    """
    Find the images most similar to the given image(s) or table row(s).

    Exactly one of `img` and `idx` must be supplied; the choice is resolved by `_check_imgs_or_idxs`
    and the result is handed to `query`.

    Args:
        img (str or list): Path to the image or a list of paths to the images.
        idx (int or list): Index of the image in the table or a list of indexes.
        limit (int): Number of results to return. Defaults to 25.
        return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.

    Returns:
        (pandas.DataFrame | pyarrow.Table): The results, in the form selected by `return_type`.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
        ```
    """
    allowed_types = {"pandas", "arrow"}
    assert return_type in allowed_types, f"Return type should be either `pandas` or `arrow`, but got {return_type}"
    sources = self._check_imgs_or_idxs(img, idx)
    matches = self.query(sources, limit=limit)

    if return_type == "pandas":
        return matches.to_pandas()
    if return_type == "arrow":
        return matches

plot_similar(img=None, idx=None, limit=25, labels=True)

Zeichne die ähnlichen Bilder auf. Akzeptiert Bilder oder Indizes.

Parameter:

Name Typ Beschreibung Standard
img str or list

Pfad zum Bild oder eine Liste von Pfaden zu den Bildern.

None
idx int or list

Index des Bildes in der Tabelle oder eine Liste von Indizes.

None
labels bool

Ob die Beschriftungen geplottet werden sollen oder nicht.

True
limit int

Anzahl der Ergebnisse, die zurückgegeben werden. Der Standardwert ist 25.

25

Rückgabe:

Typ Beschreibung
Image

Bild, das den Plot enthält.

Beispiel
exp = Explorer()
exp.create_embeddings_table()
similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
Quellcode in ultralytics/data/explorer/explorer.py
def plot_similar(
    self,
    img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
    idx: Union[int, List[int]] = None,
    limit: int = 25,
    labels: bool = True,
) -> Image.Image:
    """
    Plot the images most similar to the given image(s) or table row(s).

    Args:
        img (str or list): Path to the image or a list of paths to the images.
        idx (int or list): Index of the image in the table or a list of indexes.
        limit (int): Number of results to return. Defaults to 25.
        labels (bool): Whether to plot the labels or not.

    Returns:
        (PIL.Image): Image containing the plot, or None when nothing similar is found.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
        ```
    """
    matches = self.get_similar(img, idx, limit, return_type="arrow")
    if len(matches) != 0:
        plotted = plot_query_result(matches, plot_labels=labels)
        return Image.fromarray(plotted)
    LOGGER.info("No results found.")
    return None

plot_similarity_index(max_dist=0.2, top_k=None, force=False)

Stelle den Ähnlichkeitsindex für alle Bilder in der Tabelle dar. Dabei enthält der Index die Datenpunkte, die max_dist oder näher am Bild im Einbettungsraum bei einem bestimmten Index liegen.

Parameter:

Name Typ Beschreibung Standard
max_dist float

Maximaler L2-Abstand zwischen den zu berücksichtigenden Einbettungen. Der Standardwert ist 0,2.

0.2
top_k float

Prozentualer Anteil der nächstgelegenen Datenpunkte, die bei der Zählung berücksichtigt werden. Wird verwendet, um bei der Vektorsuche ein Limit anzuwenden. Der Standardwert ist None.

None
force bool

Ob der bestehende Ähnlichkeitsindex überschrieben werden soll oder nicht. Der Standardwert ist False.

False

Rückgabe:

Typ Beschreibung
Image

Bild, das den Plot enthält.

Beispiel
exp = Explorer()
exp.create_embeddings_table()

similarity_idx_plot = exp.plot_similarity_index()
similarity_idx_plot.show() # view image preview
similarity_idx_plot.save('path/to/save/similarity_index_plot.png') # save contents to file
Quellcode in ultralytics/data/explorer/explorer.py
def plot_similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Image.Image:
    """
    Plot the similarity index of all the images in the table. Here, the index will contain the data points that are
    max_dist or closer to the image in the embedding space at a given index.

    Args:
        max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
        top_k (float): Fraction of closest data points to consider when counting. Used to apply limit when
            running vector search. Defaults to None.
        force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.

    Returns:
        (PIL.Image): Image containing the bar plot of similar-image counts per data index.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()

        similarity_idx_plot = exp.plot_similarity_index()
        similarity_idx_plot.show() # view image preview
        similarity_idx_plot.save('path/to/save/similarity_index_plot.png') # save contents to file
        ```
    """
    sim_idx = self.similarity_index(max_dist=max_dist, top_k=top_k, force=force)
    sim_count = np.array(sim_idx["count"].tolist())
    indices = np.arange(len(sim_count))

    # Draw on a dedicated figure and close it afterwards. Using the implicit global pyplot figure
    # makes repeated calls stack their bars on top of each other and leaves figures open.
    buffer = BytesIO()
    fig, ax = plt.subplots()
    try:
        ax.bar(indices, sim_count)
        ax.set_xlabel("data idx")
        ax.set_ylabel("Count")
        ax.set_title("Similarity Count")
        fig.savefig(buffer, format="png")
    finally:
        plt.close(fig)
    buffer.seek(0)

    # Use Pillow to open the image from the buffer
    return Image.fromarray(np.array(Image.open(buffer)))

plot_sql_query(query, labels=True)

Stellt die Ergebnisse einer SQL-ähnlichen Abfrage in der Tabelle dar.

Parameter:

- query (str): SQL-Abfrage, die ausgeführt werden soll.
- labels (bool): Ob die Beschriftungen geplottet werden sollen oder nicht.

Rückgabe:

Typ Beschreibung
Image

Bild, das den Plot enthält.

Beispiel
exp = Explorer()
exp.create_embeddings_table()
query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
result = exp.plot_sql_query(query)
Quellcode in ultralytics/data/explorer/explorer.py
def plot_sql_query(self, query: str, labels: bool = True) -> Union[Image.Image, None]:
    """
    Plot the results of a SQL-Like query on the table.

    Args:
        query (str): SQL query to run. It is forwarded unchanged to `sql_query`.
        labels (bool): Whether to plot the labels or not.

    Returns:
        (PIL.Image | None): Image containing the plot, or None when the query returns no rows.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
        result = exp.plot_sql_query(query)
        ```
    """
    result = self.sql_query(query, return_type="arrow")
    if len(result) == 0:
        # Nothing to draw: log it and signal "no plot" to the caller instead of raising.
        LOGGER.info("No results found.")
        return None
    img = plot_query_result(result, plot_labels=labels)
    return Image.fromarray(img)

query(imgs=None, limit=25)

Frag die Tabelle nach ähnlichen Bildern ab. Akzeptiert ein einzelnes Bild oder eine Liste von Bildern.

Parameter:

Name Typ Beschreibung Standard
imgs str or list

Pfad zum Bild oder eine Liste von Pfaden zu den Bildern.

None
limit int

Anzahl der Ergebnisse, die zurückgegeben werden sollen.

25

Rückgabe:

Typ Beschreibung
Table

Eine Arrow-Tabelle, die die Ergebnisse enthält. Unterstützt die Konvertierung in: - Pandas-DataFrame: result.to_pandas() - Dict aus Listen: result.to_pydict()

Beispiel
exp = Explorer()
exp.create_embeddings_table()
similar = exp.query(imgs='https://ultralytics.com/images/zidane.jpg')
Quellcode in ultralytics/data/explorer/explorer.py
def query(
    self, imgs: Union[str, np.ndarray, List[str], List[np.ndarray]] = None, limit: int = 25
) -> Any:  # pyarrow.Table
    """
    Query the table for similar images. Accepts a single image or a list of images.

    When several images are passed, their embeddings are averaged and the table is searched with the mean vector.

    Args:
        imgs (str | np.ndarray | list): A single image (path or array) or a list of images.
        limit (int): Number of results to return.

    Returns:
        (pyarrow.Table): An arrow table containing the results. Supports converting to:
            - pandas dataframe: `result.to_pandas()`
            - dict of lists: `result.to_pydict()`

    Raises:
        ValueError: If the embeddings table has not been created yet.
        AssertionError: If `imgs` is neither a single image nor a list.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        similar = exp.query(imgs='https://ultralytics.com/images/zidane.jpg')
        ```
    """
    if self.table is None:
        raise ValueError("Table is not created. Please create the table first.")
    # The signature allows a bare path or a bare array; normalise both to a one-element list.
    if isinstance(imgs, (str, np.ndarray)):
        imgs = [imgs]
    assert isinstance(imgs, list), f"img must be a string or a list of strings. Got {type(imgs)}"
    embeds = self.model.embed(imgs)
    # Get avg if multiple images are passed (len > 1)
    embeds = torch.mean(torch.stack(embeds), 0).cpu().numpy() if len(embeds) > 1 else embeds[0].cpu().numpy()
    return self.table.search(embeds).limit(limit).to_arrow()

similarity_index(max_dist=0.2, top_k=None, force=False)

Berechne den Ähnlichkeitsindex für alle Bilder in der Tabelle. Dabei enthält der Index die Datenpunkte, die max_dist oder näher am Bild im Einbettungsraum bei einem bestimmten Index liegen.

Parameter:

Name Typ Beschreibung Standard
max_dist float

Maximaler L2-Abstand zwischen den zu berücksichtigenden Einbettungen. Der Standardwert ist 0,2.

0.2
top_k float

Prozentualer Anteil der nächstgelegenen Datenpunkte, die bei der Zählung berücksichtigt werden. Wird verwendet, um bei der Vektorsuche ein Limit anzuwenden. Standardwert: None.

None
force bool

Ob der bestehende Ähnlichkeitsindex überschrieben werden soll oder nicht. Der Standardwert ist False.

False

Rückgabe:

Typ Beschreibung
DataFrame

Ein Datenrahmen, der den Ähnlichkeitsindex enthält. Jede Zeile entspricht einem Bild, und die Spalten enthalten die Indizes ähnlicher Bilder und ihre jeweiligen Abstände.

Beispiel
exp = Explorer()
exp.create_embeddings_table()
sim_idx = exp.similarity_index()
Quellcode in ultralytics/data/explorer/explorer.py
def similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Any:  # pd.DataFrame
    """
    Calculate the similarity index of all the images in the table. Here, the index will contain the data points that
    are max_dist or closer to the image in the embedding space at a given index.

    The result is stored in a table whose name encodes `max_dist` and `top_k`; an existing table with the same name
    is reused unless `force` is set.

    Args:
        max_dist (float): Maximum L2 distance between the embeddings to consider. Defaults to 0.2.
        top_k (float): Fraction (0.0-1.0) of the table's rows to fetch per vector search before the distance filter
            is applied. None (or 0) searches across all rows. Defaults to None.
        force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.

    Returns:
        (pandas.DataFrame): A dataframe containing the similarity index. Each row corresponds to an image and holds
            its index, its file, the number of similar images and the files of those similar images.

    Raises:
        ValueError: If the embeddings table has not been created, if `top_k` is outside [0.0, 1.0], or if
            `max_dist` is negative.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        sim_idx = exp.similarity_index()
        ```
    """
    if self.table is None:
        raise ValueError("Table is not created. Please create the table first.")
    sim_idx_table_name = f"{self.sim_idx_base_name}_thres_{max_dist}_top_{top_k}".lower()
    if sim_idx_table_name in self.connection.table_names() and not force:
        LOGGER.info("Similarity matrix already exists. Reusing it. Pass force=True to overwrite it.")
        return self.connection.open_table(sim_idx_table_name).to_pandas()

    if top_k and not (0.0 <= top_k <= 1.0):
        raise ValueError(f"top_k must be between 0.0 and 1.0. Got {top_k}")
    if max_dist < 0.0:
        raise ValueError(f"max_dist must be greater than 0. Got {max_dist}")

    # Translate the fractional top_k into an absolute row limit; always search for at least one row.
    num_rows = len(self.table)
    search_limit = max(int(top_k * num_rows) if top_k else num_rows, 1)

    features = self.table.to_lance().to_table(columns=["vector", "im_file"]).to_pydict()
    im_files = features["im_file"]
    embeddings = features["vector"]

    sim_table = self.connection.create_table(sim_idx_table_name, schema=get_sim_index_schema(), mode="overwrite")

    def _yield_sim_idx():
        """Yield one single-row batch per image with its similar images within max_dist."""
        for i, embedding in enumerate(tqdm(embeddings)):
            neighbors = self.table.search(embedding).limit(search_limit).to_pandas()
            neighbors = neighbors.query(f"_distance <= {max_dist}")
            yield [
                {
                    "idx": i,
                    "im_file": im_files[i],
                    "count": len(neighbors),
                    "sim_im_files": neighbors["im_file"].tolist(),
                }
            ]

    sim_table.add(_yield_sim_idx())
    self.sim_index = sim_table
    return sim_table.to_pandas()

sql_query(query, return_type='pandas')

Führe eine SQL-ähnliche Abfrage auf die Tabelle aus. Verwendet das LanceDB-Prädikat Pushdown.

Parameter:

Name Typ Beschreibung Standard
query str

SQL-Abfrage, die ausgeführt werden soll.

erforderlich
return_type str

Typ des Ergebnisses, das zurückgegeben werden soll. Kann entweder "pandas" oder "arrow" sein. Der Standardwert ist "pandas".

'pandas'

Rückgabe:

Typ Beschreibung
Table

Eine Arrow-Tabelle, die die Ergebnisse enthält (bzw. ein Pandas-DataFrame, wenn return_type "pandas" ist).

Beispiel
exp = Explorer()
exp.create_embeddings_table()
query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
result = exp.sql_query(query)
Quellcode in ultralytics/data/explorer/explorer.py
def sql_query(
    self, query: str, return_type: str = "pandas"
) -> Union[Any, None]:  # pandas.DataFrame or pyarrow.Table
    """
    Run a SQL-Like query on the table. The table is converted to Arrow and the query is executed with DuckDB.

    Args:
        query (str): SQL query to run. Must start with 'SELECT' (full query) or 'WHERE' (only the filter clause, which
            is expanded to "SELECT * FROM 'table' WHERE ...").
        return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.

    Returns:
        (pandas.DataFrame | pyarrow.Table): The query results, as a dataframe for 'pandas' or an arrow table for
            'arrow'.

    Raises:
        AssertionError: If `return_type` is not 'pandas' or 'arrow'.
        ValueError: If the embeddings table has not been created, or the query does not start with SELECT or WHERE.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
        result = exp.sql_query(query)
        ```
    """
    assert return_type in {
        "pandas",
        "arrow",
    }, f"Return type should be either `pandas` or `arrow`, but got {return_type}"

    if self.table is None:
        raise ValueError("Table is not created. Please create the table first.")

    # Validate the query before the optional import and before materialising the whole table to Arrow.
    if not query.startswith(("SELECT", "WHERE")):
        raise ValueError(
            f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE "
            f"clause. found {query}"
        )
    if query.startswith("WHERE"):
        query = f"SELECT * FROM 'table' {query}"

    import duckdb  # imported lazily so it is only required when a query is actually run

    # Note: using filter pushdown would be a better long term solution. Temporarily using duckdb for this.
    # The local must be named `table` and live in this frame: it is what 'table' in the query refers to.
    table = self.table.to_arrow()  # noqa NOTE: Don't comment this. This line is used by DuckDB
    LOGGER.info(f"Running query: {query}")

    rs = duckdb.sql(query)
    if return_type == "arrow":
        return rs.arrow()
    elif return_type == "pandas":
        return rs.df()

visualize(result)

Visualisiere die Ergebnisse einer Abfrage. TODO.

Parameter:

Name Typ Beschreibung Standard
result Table

Tabelle, die die Ergebnisse einer Abfrage enthält.

erforderlich
Quellcode in ultralytics/data/explorer/explorer.py
def visualize(self, result):
    """
    Placeholder for visualizing the results of a query; not implemented yet (TODO).

    Args:
        result (pyarrow.Table): Table containing the results of a query.
    """
    return None





Erstellt am 2024-01-10, Aktualisiert am 2024-05-08
Autoren: Burhan-Q (1), glenn-jocher (1)

Kommentare