コンテンツへスキップ

参考 ultralytics/data/explorer/explorer.py

備考

このファイルは https://github.com/ultralytics/ultralytics/blob/main/ultralytics/data/explorer/explorer.py にあります。問題を発見した場合は、Pull Request 🛠️ を投稿して修正にご協力ください。ありがとうございます!



ultralytics.data.explorer.explorer.ExplorerDataset

ベース: YOLODataset

ソースコード ultralytics/data/explorer/explorer.py
class ExplorerDataset(YOLODataset):
    """YOLODataset subclass whose image loading and transform pipeline skip every resize step."""

    def __init__(self, *args, data: dict = None, **kwargs) -> None:
        """Forward all positional and keyword arguments (including `data`) to YOLODataset."""
        super().__init__(*args, data=data, **kwargs)

    def load_image(self, i: int) -> Union[Tuple[np.ndarray, Tuple[int, int], Tuple[int, int]], Tuple[None, None, None]]:
        """
        Return the image at dataset index 'i' without applying any resize ops.

        Args:
            i (int): Index into the dataset's image lists.

        Returns:
            (tuple): `(image, hw0, hw)`. For an image cached in RAM these are the cached entries of
                `self.ims`, `self.im_hw0` and `self.im_hw`; otherwise both size entries are taken from
                the freshly loaded array's first two dimensions.

        Raises:
            FileNotFoundError: If no .npy file exists and `cv2.imread` returns None for the image file.
        """
        image, img_path, npy_path = self.ims[i], self.im_files[i], self.npy_files[i]
        if image is not None:
            # RAM cache hit: reuse the cached array together with its recorded sizes
            return self.ims[i], self.im_hw0[i], self.im_hw[i]

        if npy_path.exists():
            image = np.load(npy_path)  # load npy
        else:
            image = cv2.imread(img_path)  # BGR
            if image is None:
                raise FileNotFoundError(f"Image Not Found {img_path}")

        height, width = image.shape[:2]  # orig hw
        return image, (height, width), image.shape[:2]

    def build_transforms(self, hyp: IterableSimpleNamespace = None):
        """
        Create the Format transform for dataset images (xyxy boxes, not normalized, no resizing).

        Args:
            hyp (IterableSimpleNamespace): Must expose `mask_ratio` and `overlap_mask`; both are read
                unconditionally, so the None default cannot actually be used.

        Returns:
            (Format): The configured formatting transform.
        """
        format_kwargs = dict(
            bbox_format="xyxy",
            normalize=False,
            return_mask=self.use_segments,
            return_keypoint=self.use_keypoints,
            batch_idx=True,
            mask_ratio=hyp.mask_ratio,
            mask_overlap=hyp.overlap_mask,
        )
        return Format(**format_kwargs)

build_transforms(hyp=None)

データセット画像の変換を、リサイズなしで行います。

ソースコード ultralytics/data/explorer/explorer.py
def build_transforms(self, hyp: IterableSimpleNamespace = None):
    """
    Create the Format transform for dataset images (xyxy boxes, not normalized, no resizing).

    Args:
        hyp (IterableSimpleNamespace): Must expose `mask_ratio` and `overlap_mask`; both are read
            unconditionally, so the None default cannot actually be used.

    Returns:
        (Format): The configured formatting transform.
    """
    format_kwargs = dict(
        bbox_format="xyxy",
        normalize=False,
        return_mask=self.use_segments,
        return_keypoint=self.use_keypoints,
        batch_idx=True,
        mask_ratio=hyp.mask_ratio,
        mask_overlap=hyp.overlap_mask,
    )
    return Format(**format_kwargs)

load_image(i)

データセットのインデックス 'i' から、リサイズ操作なしで画像を1枚読み込みます。

ソースコード ultralytics/data/explorer/explorer.py
def load_image(self, i: int) -> Union[Tuple[np.ndarray, Tuple[int, int], Tuple[int, int]], Tuple[None, None, None]]:
    """
    Return the image at dataset index 'i' without applying any resize ops.

    Args:
        i (int): Index into the dataset's image lists.

    Returns:
        (tuple): `(image, hw0, hw)`. For an image cached in RAM these are the cached entries of
            `self.ims`, `self.im_hw0` and `self.im_hw`; otherwise both size entries are taken from
            the freshly loaded array's first two dimensions.

    Raises:
        FileNotFoundError: If no .npy file exists and `cv2.imread` returns None for the image file.
    """
    image, img_path, npy_path = self.ims[i], self.im_files[i], self.npy_files[i]
    if image is not None:
        # RAM cache hit: reuse the cached array together with its recorded sizes
        return self.ims[i], self.im_hw0[i], self.im_hw[i]

    if npy_path.exists():
        image = np.load(npy_path)  # load npy
    else:
        image = cv2.imread(img_path)  # BGR
        if image is None:
            raise FileNotFoundError(f"Image Not Found {img_path}")

    height, width = image.shape[:2]  # orig hw
    return image, (height, width), image.shape[:2]



ultralytics.data.explorer.explorer.Explorer

ソースコード ultralytics/data/explorer/explorer.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
class Explorer:
    def __init__(
        self,
        data: Union[str, Path] = "coco128.yaml",
        model: str = "yolov8n.pt",
        uri: Union[str, Path] = USER_CONFIG_DIR / "explorer",
    ) -> None:
        """
        Connect to the LanceDB store and load the YOLO model used for embeddings.

        Args:
            data (str | Path): Dataset YAML; its file name (lower-cased) becomes part of the table name.
            model (str): Model name passed to YOLO; also lower-cased into the table name.
            uri (str | Path): Location handed to `lancedb.connect`. Defaults to `USER_CONFIG_DIR / "explorer"`.

        Attributes set here:
            connection: LanceDB connection opened at `uri`.
            table_name (str): Name of the embeddings table for this data/model pair.
            sim_idx_base_name (str): Prefix for similarity-index table names.
            model: The loaded YOLO model.
            data: The `data` argument as given.
            choice_set: None until create_embeddings_table() stores the split paths.
            table: None until an embeddings table is created or reopened.
            sim_index: None until similarity_index() builds a new index table.
            progress: 0 initially; updated while the embeddings table is built.
        """
        # Note duckdb==0.10.0 bug https://github.com/ultralytics/ultralytics/pull/8181
        checks.check_requirements(["lancedb>=0.4.3", "duckdb<=0.9.2"])
        import lancedb  # imported only after the requirement check above

        self.connection = lancedb.connect(uri)
        self.table_name = f"{Path(data).name.lower()}_{model.lower()}"
        # Use this name and append thres and top_k to reuse the table
        self.sim_idx_base_name = f"{self.table_name}_sim_idx".lower()
        self.model = YOLO(model)
        self.data = data
        self.choice_set = None

        self.table = None
        # Defined up front so reading it before similarity_index() runs does not raise AttributeError
        self.sim_index = None
        self.progress = 0

    def create_embeddings_table(self, force: bool = False, split: str = "train") -> None:
        """
        Build (or reopen) the LanceDB table holding the embeddings of the images in a dataset split.

        An existing table is reused unless force=True is passed, in which case it is overwritten.

        Args:
            force (bool): Whether to overwrite the existing table or not. Defaults to False.
            split (str): Split of the dataset to use. Defaults to 'train'.

        Raises:
            ValueError: If `self.data` is None, or `split` is not a key of the checked dataset info.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            ```
        """
        keep_existing = not force
        if keep_existing and self.table is not None:
            LOGGER.info("Table already exists. Reusing it. Pass force=True to overwrite it.")
            return
        if self.table_name in self.connection.table_names() and keep_existing:
            LOGGER.info(f"Table {self.table_name} already exists. Reusing it. Pass force=True to overwrite it.")
            self.table = self.connection.open_table(self.table_name)
            self.progress = 1
            return
        if self.data is None:
            raise ValueError("Data must be provided to create embeddings table")

        data_info = check_det_dataset(self.data)
        if split not in data_info:
            available = list(data_info.keys())
            raise ValueError(
                f"Split {split} is not found in the dataset. Available keys in the dataset are {available}"
            )

        # The split entry may be a single path or a list of paths; always work with a list
        split_paths = data_info[split]
        if not isinstance(split_paths, list):
            split_paths = [split_paths]
        self.choice_set = split_paths
        dataset = ExplorerDataset(
            img_path=split_paths, data=data_info, augment=False, cache=False, task=self.model.task
        )

        # Embed the first sample once to learn the vector width needed for the table schema
        first_sample = dataset[0]
        vector_size = self.model.embed(first_sample["im_file"], verbose=False)[0].shape[0]
        schema = get_table_schema(vector_size)
        table = self.connection.create_table(self.table_name, schema=schema, mode="overwrite")

        skipped_keys = ["img", "ratio_pad", "resized_shape", "ori_shape", "batch_idx"]
        table.add(self._yield_batches(dataset, data_info, self.model, exclude_keys=skipped_keys))

        self.table = table

    def _yield_batches(self, dataset: ExplorerDataset, data_info: dict, model: YOLO, exclude_keys: List[str]):
        """
        Yield one single-row batch per dataset sample, ready to be added to the embeddings table.

        Each sample has `exclude_keys` removed, is passed through `sanitize_batch`, and gets its embedding
        stored under "vector". `self.progress` is updated to the fraction of samples processed so far.
        """
        total = len(dataset)
        for index in tqdm(range(total)):
            self.progress = float(index + 1) / total
            sample = dataset[index]
            for key in exclude_keys:
                sample.pop(key, None)
            sample = sanitize_batch(sample, data_info)
            embedding = model.embed(sample["im_file"], verbose=False)[0]
            sample["vector"] = embedding.detach().tolist()
            yield [sample]

    def query(
        self, imgs: Union[str, np.ndarray, List[str], List[np.ndarray]] = None, limit: int = 25
    ) -> Any:  # pyarrow.Table
        """
        Query the table for similar images. Accepts a single image or a list of images.

        When several images are given, their embeddings are averaged and the mean vector is searched.

        Args:
            imgs (str | np.ndarray | list): A single image (path or array) or a list of them.
            limit (int): Number of results to return.

        Returns:
            (pyarrow.Table): An arrow table containing the results. Supports converting to:
                - pandas dataframe: `result.to_pandas()`
                - dict of lists: `result.to_pydict()`

        Raises:
            ValueError: If the embeddings table has not been created yet.
            AssertionError: If `imgs` is neither a single image nor a list.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            similar = exp.query(imgs='https://ultralytics.com/images/zidane.jpg')
            ```
        """
        if self.table is None:
            raise ValueError("Table is not created. Please create the table first.")
        # A lone image (path or array) is wrapped so the rest of the method always sees a list
        if isinstance(imgs, (str, np.ndarray)):
            imgs = [imgs]
        assert isinstance(imgs, list), f"img must be a string or a list of strings. Got {type(imgs)}"
        embeds = self.model.embed(imgs)
        # Get avg if multiple images are passed (len > 1)
        embeds = torch.mean(torch.stack(embeds), 0).cpu().numpy() if len(embeds) > 1 else embeds[0].cpu().numpy()
        return self.table.search(embeds).limit(limit).to_arrow()

    def sql_query(
        self, query: str, return_type: str = "pandas"
    ) -> Union[DataFrame, Any, None]:  # pandas.dataframe or pyarrow.Table
        """
        Run a SQL-like query against the embeddings table via DuckDB.

        Either a full statement starting with SELECT or just a clause starting with WHERE may be passed;
        a WHERE clause is prefixed with `SELECT * FROM 'table'`. The prefix check is case-sensitive.

        Args:
            query (str): SQL query to run.
            return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.

        Returns:
            (pandas.DataFrame | pyarrow.Table): The query result in the requested format.

        Raises:
            ValueError: If the table has not been created, or the query starts with neither SELECT nor WHERE.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
            result = exp.sql_query(query)
            ```
        """
        allowed_types = {"pandas", "arrow"}
        assert (
            return_type in allowed_types
        ), f"Return type should be either `pandas` or `arrow`, but got {return_type}"
        import duckdb

        if self.table is None:
            raise ValueError("Table is not created. Please create the table first.")

        # Note: using filter pushdown would be a better long term solution. Temporarily using duckdb for this.
        # Do not remove or rename this local: it looks unused, but it is used by DuckDB (queries refer to 'table').
        table = self.table.to_arrow()  # noqa
        if not query.startswith(("SELECT", "WHERE")):
            raise ValueError(
                f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE clause. found {query}"
            )
        if query.startswith("WHERE"):
            query = f"SELECT * FROM 'table' {query}"
        LOGGER.info(f"Running query: {query}")

        rs = duckdb.sql(query)
        if return_type == "arrow":
            return rs.arrow()
        if return_type == "pandas":
            return rs.df()

    def plot_sql_query(self, query: str, labels: bool = True) -> Image.Image:
        """
        Plot the results of a SQL-Like query on the table.

        Args:
            query (str): SQL query to run.
            labels (bool): Whether to plot the labels or not.

        Returns:
            (PIL.Image | None): Image containing the plot, or None when the query matches no rows.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
            result = exp.plot_sql_query(query)
            ```
        """
        result = self.sql_query(query, return_type="arrow")
        if len(result) > 0:
            return Image.fromarray(plot_query_result(result, plot_labels=labels))
        LOGGER.info("No results found.")
        return None

    def get_similar(
        self,
        img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
        idx: Union[int, List[int]] = None,
        limit: int = 25,
        return_type: str = "pandas",
    ) -> Union[DataFrame, Any]:  # pandas.dataframe or pyarrow.Table
        """
        Find images similar to the given image(s) or to the table row(s) at the given index(es).

        Exactly one of `img` and `idx` must be supplied.

        Args:
            img (str or list): Path to the image or a list of paths to the images.
            idx (int or list): Index of the image in the table or a list of indexes.
            limit (int): Number of results to return. Defaults to 25.
            return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.

        Returns:
            (pandas.DataFrame | pyarrow.Table): The results in the requested format.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
            ```
        """
        allowed_types = {"pandas", "arrow"}
        assert (
            return_type in allowed_types
        ), f"Return type should be either `pandas` or `arrow`, but got {return_type}"
        similar = self.query(self._check_imgs_or_idxs(img, idx), limit=limit)

        if return_type == "arrow":
            return similar
        if return_type == "pandas":
            return similar.to_pandas()

    def plot_similar(
        self,
        img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
        idx: Union[int, List[int]] = None,
        limit: int = 25,
        labels: bool = True,
    ) -> Image.Image:
        """
        Plot the images most similar to the given image(s) or table index(es).

        Args:
            img (str or list): Path to the image or a list of paths to the images.
            idx (int or list): Index of the image in the table or a list of indexes.
            limit (int): Number of results to return. Defaults to 25.
            labels (bool): Whether to plot the labels or not.

        Returns:
            (PIL.Image | None): Image containing the plot, or None when no similar images are found.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
            ```
        """
        similar = self.get_similar(img, idx, limit, return_type="arrow")
        if len(similar) > 0:
            return Image.fromarray(plot_query_result(similar, plot_labels=labels))
        LOGGER.info("No results found.")
        return None

    def similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> DataFrame:
        """
        Calculate the similarity index of all the images in the table.

        For every image, the index records the images whose embeddings lie within `max_dist` of it. The
        result is stored in its own table (named after `max_dist` and `top_k`) and reused on later calls
        unless force=True.

        Args:
            max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
            top_k (float): Fraction (0.0-1.0) of the closest data points to consider when counting. Used to
                apply a limit when running vector search. Defaults to None, meaning all rows.
            force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.

        Returns:
            (pandas.DataFrame): A dataframe containing the similarity index. Each row corresponds to an image
                and is built from the fields `idx`, `im_file`, `count` and `sim_im_files`.

        Raises:
            ValueError: If the embeddings table is missing, `top_k` is outside [0.0, 1.0], or `max_dist` is negative.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            sim_idx = exp.similarity_index()
            ```
        """
        if self.table is None:
            raise ValueError("Table is not created. Please create the table first.")
        sim_idx_table_name = f"{self.sim_idx_base_name}_thres_{max_dist}_top_{top_k}".lower()
        if sim_idx_table_name in self.connection.table_names() and not force:
            LOGGER.info("Similarity matrix already exists. Reusing it. Pass force=True to overwrite it.")
            return self.connection.open_table(sim_idx_table_name).to_pandas()

        if top_k and not 0.0 <= top_k <= 1.0:
            raise ValueError(f"top_k must be between 0.0 and 1.0. Got {top_k}")
        if max_dist < 0.0:
            raise ValueError(f"max_dist must be greater than 0. Got {max_dist}")

        # Turn the fraction into an absolute search limit; always search for at least one neighbour
        n_rows = len(self.table)
        search_limit = max(int(top_k * n_rows) if top_k else n_rows, 1)

        features = self.table.to_lance().to_table(columns=["vector", "im_file"]).to_pydict()
        embeddings, im_files = features["vector"], features["im_file"]

        sim_table = self.connection.create_table(sim_idx_table_name, schema=get_sim_index_schema(), mode="overwrite")

        def _yield_sim_idx():
            """Yield, per image, a one-row batch listing the images found within max_dist of it."""
            for i in tqdm(range(len(embeddings))):
                hits = self.table.search(embeddings[i]).limit(search_limit).to_pandas()
                near = hits.query(f"_distance <= {max_dist}")
                row = {
                    "idx": i,
                    "im_file": im_files[i],
                    "count": len(near),
                    "sim_im_files": near["im_file"].tolist(),
                }
                yield [row]

        sim_table.add(_yield_sim_idx())
        self.sim_index = sim_table
        return sim_table.to_pandas()

    def plot_similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Image.Image:
        """
        Plot, as a bar chart, how many similar images each image in the table has.

        The counts come from `similarity_index()`, i.e. the number of data points within `max_dist` of
        each image in the embedding space.

        Args:
            max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
            top_k (float): Percentage of closest data points to consider when counting. Used to apply limit when
                running vector search. Defaults to None.
            force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.

        Returns:
            (PIL.Image): Image containing the plot.

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()

            similarity_idx_plot = exp.plot_similarity_index()
            similarity_idx_plot.show() # view image preview
            similarity_idx_plot.save('path/to/save/similarity_index_plot.png') # save contents to file
            ```
        """
        sim_idx = self.similarity_index(max_dist=max_dist, top_k=top_k, force=force)
        sim_count = np.array(sim_idx["count"].tolist())
        indices = np.arange(len(sim_count))

        # Draw on a dedicated figure and always close it: using the implicit global pyplot figure
        # would stack bars from repeated calls on top of each other and keep figures alive.
        fig, ax = plt.subplots()
        try:
            ax.bar(indices, sim_count)
            ax.set_xlabel("data idx")
            ax.set_ylabel("Count")
            ax.set_title("Similarity Count")
            buffer = BytesIO()
            fig.savefig(buffer, format="png")
        finally:
            plt.close(fig)
        buffer.seek(0)

        # Use Pillow to open the image from the buffer
        return Image.fromarray(np.array(Image.open(buffer)))

    def _check_imgs_or_idxs(
        self, img: Union[str, np.ndarray, List[str], List[np.ndarray], None], idx: Union[None, int, List[int]]
    ) -> List[np.ndarray]:
        """
        Normalize the `img`/`idx` pair accepted by the similarity methods into a list of images.

        Exactly one of the two must be given. Indexes are resolved to the `im_file` values of the
        corresponding table rows; a single image is wrapped in a list.

        Raises:
            ValueError: If both or neither of `img` and `idx` are provided.
        """
        has_img = img is not None
        has_idx = idx is not None
        if not (has_img or has_idx):
            raise ValueError("Either img or idx must be provided.")
        if has_img and has_idx:
            raise ValueError("Only one of img or idx must be provided.")

        if has_idx:
            rows = idx if isinstance(idx, list) else [idx]
            img = self.table.to_lance().take(rows, columns=["im_file"]).to_pydict()["im_file"]

        if isinstance(img, list):
            return img
        return [img]

    def ask_ai(self, query):
        """
        Turn a natural-language question into a SQL query and run it on the table.

        Args:
            query (str): Question to ask.

        Returns:
            (pandas.DataFrame | None): A dataframe containing filtered results to the SQL query, or None
                when running the generated query fails (the error is logged).

        Example:
            ```python
            exp = Explorer()
            exp.create_embeddings_table()
            answer = exp.ask_ai('Show images with 1 person and 2 dogs')
            ```
        """
        generated_sql = prompt_sql_query(query)
        try:
            return self.sql_query(generated_sql)
        except Exception as e:
            # Best effort: a bad generated query is reported instead of propagated
            LOGGER.error("AI generated query is not valid. Please try again with a different prompt")
            LOGGER.error(e)
            return None

    def visualize(self, result):
        """
        Visualize the results of a query. Not implemented yet (TODO); currently does nothing.

        Args:
            result (pyarrow.Table): Table containing the results of a query.
        """

    def generate_report(self, result):
        """
        Generate a report of the dataset. Not implemented yet (TODO); currently does nothing.

        Args:
            result: Unused for now.
        """

ask_ai(query)

AIに質問する

パラメーター

名称 タイプ 説明 デフォルト
query str

質問する内容。

必須

リターンズ

タイプ 説明
DataFrame

SQL クエリでフィルタリングされた結果を含むデータフレーム。

例
exp = Explorer()
exp.create_embeddings_table()
answer = exp.ask_ai('Show images with 1 person and 2 dogs')
ソースコード ultralytics/data/explorer/explorer.py
def ask_ai(self, query):
    """
    Turn a natural-language question into a SQL query and run it on the table.

    Args:
        query (str): Question to ask.

    Returns:
        (pandas.DataFrame | None): A dataframe containing filtered results to the SQL query, or None
            when running the generated query fails (the error is logged).

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        answer = exp.ask_ai('Show images with 1 person and 2 dogs')
        ```
    """
    generated_sql = prompt_sql_query(query)
    try:
        return self.sql_query(generated_sql)
    except Exception as e:
        # Best effort: a bad generated query is reported instead of propagated
        LOGGER.error("AI generated query is not valid. Please try again with a different prompt")
        LOGGER.error(e)
        return None

create_embeddings_table(force=False, split='train')

データセット内の画像の埋め込みを含む LanceDB テーブルを作成します。テーブルが既に存在する場合は再利用されます。既存のテーブルを上書きするには force=True を渡します。

パラメーター

名称 タイプ 説明 デフォルト
force bool

既存のテーブルを上書きするかどうか。デフォルトはFalse。

False
split str

使用するデータセットの分割。デフォルトは 'train'.

'train'
例
exp = Explorer()
exp.create_embeddings_table()
ソースコード ultralytics/data/explorer/explorer.py
def create_embeddings_table(self, force: bool = False, split: str = "train") -> None:
    """
    Build (or reopen) the LanceDB table holding the embeddings of the images in a dataset split.

    An existing table is reused unless force=True is passed, in which case it is overwritten.

    Args:
        force (bool): Whether to overwrite the existing table or not. Defaults to False.
        split (str): Split of the dataset to use. Defaults to 'train'.

    Raises:
        ValueError: If `self.data` is None, or `split` is not a key of the checked dataset info.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        ```
    """
    keep_existing = not force
    if keep_existing and self.table is not None:
        LOGGER.info("Table already exists. Reusing it. Pass force=True to overwrite it.")
        return
    if self.table_name in self.connection.table_names() and keep_existing:
        LOGGER.info(f"Table {self.table_name} already exists. Reusing it. Pass force=True to overwrite it.")
        self.table = self.connection.open_table(self.table_name)
        self.progress = 1
        return
    if self.data is None:
        raise ValueError("Data must be provided to create embeddings table")

    data_info = check_det_dataset(self.data)
    if split not in data_info:
        available = list(data_info.keys())
        raise ValueError(
            f"Split {split} is not found in the dataset. Available keys in the dataset are {available}"
        )

    # The split entry may be a single path or a list of paths; always work with a list
    split_paths = data_info[split]
    if not isinstance(split_paths, list):
        split_paths = [split_paths]
    self.choice_set = split_paths
    dataset = ExplorerDataset(
        img_path=split_paths, data=data_info, augment=False, cache=False, task=self.model.task
    )

    # Embed the first sample once to learn the vector width needed for the table schema
    first_sample = dataset[0]
    vector_size = self.model.embed(first_sample["im_file"], verbose=False)[0].shape[0]
    schema = get_table_schema(vector_size)
    table = self.connection.create_table(self.table_name, schema=schema, mode="overwrite")

    skipped_keys = ["img", "ratio_pad", "resized_shape", "ori_shape", "batch_idx"]
    table.add(self._yield_batches(dataset, data_info, self.model, exclude_keys=skipped_keys))

    self.table = table

generate_report(result)

データセットのレポートを作成する。

TODO

ソースコード ultralytics/data/explorer/explorer.py
def generate_report(self, result):
    """
    Generate a report of the dataset. Not implemented yet (TODO); currently does nothing.

    Args:
        result: Unused for now.
    """

get_similar(img=None, idx=None, limit=25, return_type='pandas')

類似画像をテーブルに問い合わせる。単一の画像または画像のリストを受け付けます。

パラメーター

名称 タイプ 説明 デフォルト
img str or list

画像へのパス、または画像へのパスのリスト。

None
idx int or list

テーブル内の画像のインデックス、またはインデックスのリスト。

None
limit int

返す結果の数。デフォルトは25。

25
return_type str

返す結果の型。'pandas' または 'arrow' のいずれかを指定します。デフォルトは 'pandas' です。

'pandas'

リターンズ

タイプ 説明
DataFrame

結果を含むデータフレーム。

例
exp = Explorer()
exp.create_embeddings_table()
similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
ソースコード ultralytics/data/explorer/explorer.py
def get_similar(
    self,
    img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
    idx: Union[int, List[int]] = None,
    limit: int = 25,
    return_type: str = "pandas",
) -> Union[DataFrame, Any]:  # pandas.dataframe or pyarrow.Table
    """
    Find images similar to the given image(s) or to the table row(s) at the given index(es).

    Exactly one of `img` and `idx` must be supplied.

    Args:
        img (str or list): Path to the image or a list of paths to the images.
        idx (int or list): Index of the image in the table or a list of indexes.
        limit (int): Number of results to return. Defaults to 25.
        return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.

    Returns:
        (pandas.DataFrame | pyarrow.Table): The results in the requested format.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        similar = exp.get_similar(img='https://ultralytics.com/images/zidane.jpg')
        ```
    """
    allowed_types = {"pandas", "arrow"}
    assert (
        return_type in allowed_types
    ), f"Return type should be either `pandas` or `arrow`, but got {return_type}"
    similar = self.query(self._check_imgs_or_idxs(img, idx), limit=limit)

    if return_type == "arrow":
        return similar
    if return_type == "pandas":
        return similar.to_pandas()

plot_similar(img=None, idx=None, limit=25, labels=True)

類似画像をプロットする。画像またはインデックスを受け付ける。

パラメーター

名称 タイプ 説明 デフォルト
img str or list

画像へのパス、または画像へのパスのリスト。

None
idx int or list

テーブル内の画像のインデックス、またはインデックスのリスト。

None
labels bool

ラベルをプロットするかどうか。

True
limit int

返す結果の数。デフォルトは25。

25

リターンズ

タイプ 説明
Image

プロットを含む画像。

例
exp = Explorer()
exp.create_embeddings_table()
similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
ソースコード ultralytics/data/explorer/explorer.py
def plot_similar(
    self,
    img: Union[str, np.ndarray, List[str], List[np.ndarray]] = None,
    idx: Union[int, List[int]] = None,
    limit: int = 25,
    labels: bool = True,
) -> Image.Image:
    """
    Plot the images most similar to the given image(s) or table index(es).

    Args:
        img (str or list): Path to the image or a list of paths to the images.
        idx (int or list): Index of the image in the table or a list of indexes.
        limit (int): Number of results to return. Defaults to 25.
        labels (bool): Whether to plot the labels or not.

    Returns:
        (PIL.Image | None): Image containing the plot, or None when no similar images are found.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        similar = exp.plot_similar(img='https://ultralytics.com/images/zidane.jpg')
        ```
    """
    similar = self.get_similar(img, idx, limit, return_type="arrow")
    if len(similar) > 0:
        return Image.fromarray(plot_query_result(similar, plot_labels=labels))
    LOGGER.info("No results found.")
    return None

plot_similarity_index(max_dist=0.2, top_k=None, force=False)

テーブル内のすべての画像の類似度インデックスをプロットします。ここでインデックスには、各インデックスの画像に対して、埋め込み空間内での距離が max_dist 以下のデータ点が含まれます。

パラメーター

名称 タイプ 説明 デフォルト
max_dist float

考慮する埋め込み間の最大L2距離。デフォルトは0.2。

0.2
top_k float

カウントする際に考慮する最も近いデータポイントの割合。ベクトル検索を実行するときの件数制限として使用されます。デフォルトは None。

None
force bool

既存の類似度インデックスを上書きするかどうか。デフォルトは False。

False

リターンズ

タイプ 説明
Image

プロットを含む画像。

例
exp = Explorer()
exp.create_embeddings_table()

similarity_idx_plot = exp.plot_similarity_index()
similarity_idx_plot.show() # view image preview
similarity_idx_plot.save('path/to/save/similarity_index_plot.png') # save contents to file
ソースコード ultralytics/data/explorer/explorer.py
def plot_similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Image.Image:
    """
    Plot, as a bar chart, how many similar images each image in the table has.

    The counts come from `similarity_index()`, i.e. the number of data points within `max_dist` of
    each image in the embedding space.

    Args:
        max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
        top_k (float): Percentage of closest data points to consider when counting. Used to apply limit when
            running vector search. Defaults to None.
        force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.

    Returns:
        (PIL.Image): Image containing the plot.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()

        similarity_idx_plot = exp.plot_similarity_index()
        similarity_idx_plot.show() # view image preview
        similarity_idx_plot.save('path/to/save/similarity_index_plot.png') # save contents to file
        ```
    """
    sim_idx = self.similarity_index(max_dist=max_dist, top_k=top_k, force=force)
    sim_count = np.array(sim_idx["count"].tolist())
    indices = np.arange(len(sim_count))

    # Draw on a dedicated figure and always close it: using the implicit global pyplot figure
    # would stack bars from repeated calls on top of each other and keep figures alive.
    fig, ax = plt.subplots()
    try:
        ax.bar(indices, sim_count)
        ax.set_xlabel("data idx")
        ax.set_ylabel("Count")
        ax.set_title("Similarity Count")
        buffer = BytesIO()
        fig.savefig(buffer, format="png")
    finally:
        plt.close(fig)
    buffer.seek(0)

    # Use Pillow to open the image from the buffer
    return Image.fromarray(np.array(Image.open(buffer)))

plot_sql_query(query, labels=True)

SQL ライクなクエリをテーブルに対して実行し、その結果をプロットします。

パラメーター

名称 タイプ 説明 デフォルト
query str

実行する SQL クエリ。

必須
labels bool

ラベルをプロットするかどうか。

True

リターンズ

タイプ 説明
Image

プロットを含む画像。

例
exp = Explorer()
exp.create_embeddings_table()
query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
result = exp.plot_sql_query(query)
ソースコード ultralytics/data/explorer/explorer.py
def plot_sql_query(self, query: str, labels: bool = True) -> Image.Image:
    """
    Plot the results of a SQL-Like query on the table.

    Args:
        query (str): SQL query to run.
        labels (bool): Whether to plot the labels or not.

    Returns:
        (PIL.Image | None): Image containing the plot, or None when the query matches no rows.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
        result = exp.plot_sql_query(query)
        ```
    """
    result = self.sql_query(query, return_type="arrow")
    if len(result) > 0:
        return Image.fromarray(plot_query_result(result, plot_labels=labels))
    LOGGER.info("No results found.")
    return None

query(imgs=None, limit=25)

類似画像をテーブルに問い合わせる。単一の画像または画像のリストを受け付けます。

パラメーター

名称 タイプ 説明 デフォルト
imgs str or list

画像へのパス、または画像へのパスのリスト。

None
limit int

返す結果の数。

25

リターンズ

タイプ 説明
Table

結果を含む Arrow テーブル。次の形式への変換をサポートしています: - pandas データフレーム: result.to_pandas() - リストの辞書: result.to_pydict()

例
exp = Explorer()
exp.create_embeddings_table()
similar = exp.query(imgs='https://ultralytics.com/images/zidane.jpg')
ソースコード ultralytics/data/explorer/explorer.py
def query(
    self, imgs: Union[str, np.ndarray, List[str], List[np.ndarray]] = None, limit: int = 25
) -> Any:  # pyarrow.Table
    """
    Query the table for similar images. Accepts a single image or a list of images.

    When several images are given, their embeddings are averaged and the averaged vector is used for the search.

    Args:
        imgs (str | np.ndarray | list): A single image (path or array) or a list of images.
        limit (int): Number of results to return.

    Returns:
        (pyarrow.Table): An arrow table containing the results. Supports converting to:
            - pandas dataframe: `result.to_pandas()`
            - dict of lists: `result.to_pydict()`

    Raises:
        ValueError: If the embeddings table has not been created yet.
        AssertionError: If `imgs` is neither a single image nor a list.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        similar = exp.query(imgs='https://ultralytics.com/images/zidane.jpg')
        ```
    """
    if self.table is None:
        raise ValueError("Table is not created. Please create the table first.")
    # A single path or a single image array is treated as a one-element batch, matching the annotated types.
    if isinstance(imgs, (str, np.ndarray)):
        imgs = [imgs]
    assert isinstance(imgs, list), f"img must be a string or a list of strings. Got {type(imgs)}"
    embeds = self.model.embed(imgs)
    # Get avg if multiple images are passed (len > 1)
    embeds = torch.mean(torch.stack(embeds), 0).cpu().numpy() if len(embeds) > 1 else embeds[0].cpu().numpy()
    return self.table.search(embeds).limit(limit).to_arrow()

similarity_index(max_dist=0.2, top_k=None, force=False)

テーブル内のすべての画像の類似度インデックスを計算します。このインデックスには、各画像について、埋め込み空間でその画像から max_dist 以内の距離にあるデータ点が含まれます。

パラメーター

名称 タイプ 説明 デフォルト
max_dist float

考慮する埋め込み間の最大L2距離。デフォルトは0.2。

0.2
top_k float

カウント時に考慮する最も近いデータポイントの割合(0.0〜1.0)。ベクトル検索を実行する際に件数の上限を適用するために使用されます。デフォルトは None。

None
force bool

既存の類似度インデックスを上書きするかどうか。デフォルトは False。

False

リターンズ

タイプ 説明
DataFrame

類似度インデックスを含むデータフレーム。各行が1枚の画像に対応し、列には画像のインデックス(idx)、ファイルパス(im_file)、類似画像の数(count)、類似画像のファイルパス一覧(sim_im_files)が含まれます。

例
exp = Explorer()
exp.create_embeddings_table()
sim_idx = exp.similarity_index()
ソースコード ultralytics/data/explorer/explorer.py
def similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> DataFrame:
    """
    Calculate the similarity index of all the images in the table.

    For every image, the index records the data points that are `max_dist` or closer to it in the embedding space.
    The result is stored in a table whose name encodes `max_dist` and `top_k`, and is reused on later calls unless
    `force` is set.

    Args:
        max_dist (float): Maximum L2 distance between the embeddings to consider. Defaults to 0.2.
        top_k (float): Fraction (0.0 to 1.0) of the closest data points to consider when counting. Used to apply a
                       limit when running vector search. None or 0 means the whole table. Defaults to None.
        force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.

    Returns:
        (pandas.DataFrame): A dataframe containing the similarity index. Each row corresponds to an image and holds
                            its index (`idx`), its path (`im_file`), the number of similar images (`count`) and
                            their paths (`sim_im_files`).

    Raises:
        ValueError: If the embeddings table has not been created, if `top_k` is outside [0.0, 1.0], or if
                    `max_dist` is negative.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        sim_idx = exp.similarity_index()
        ```
    """
    if self.table is None:
        raise ValueError("Table is not created. Please create the table first.")
    # Validate the arguments up front so bad input fails before any database access.
    if top_k and not (1.0 >= top_k >= 0.0):
        raise ValueError(f"top_k must be between 0.0 and 1.0. Got {top_k}")
    if max_dist < 0.0:
        raise ValueError(f"max_dist must be greater than 0. Got {max_dist}")

    sim_idx_table_name = f"{self.sim_idx_base_name}_thres_{max_dist}_top_{top_k}".lower()
    if sim_idx_table_name in self.connection.table_names() and not force:
        LOGGER.info("Similarity matrix already exists. Reusing it. Pass force=True to overwrite it.")
        return self.connection.open_table(sim_idx_table_name).to_pandas()

    # Turn the fraction into an absolute row limit; always search for at least one neighbour.
    num_rows = len(self.table)
    search_limit = max(int(top_k * num_rows) if top_k else num_rows, 1)
    features = self.table.to_lance().to_table(columns=["vector", "im_file"]).to_pydict()
    im_files = features["im_file"]
    embeddings = features["vector"]

    sim_table = self.connection.create_table(sim_idx_table_name, schema=get_sim_index_schema(), mode="overwrite")

    def _yield_sim_idx():
        """Yield one single-row batch per image with its similar-image count and file list."""
        for i, embedding in enumerate(tqdm(embeddings)):
            # Nearest-neighbour search capped at `search_limit`, then keep only hits within `max_dist`.
            sim_idx = self.table.search(embedding).limit(search_limit).to_pandas().query(f"_distance <= {max_dist}")
            yield [
                {
                    "idx": i,
                    "im_file": im_files[i],
                    "count": len(sim_idx),
                    "sim_im_files": sim_idx["im_file"].tolist(),
                }
            ]

    sim_table.add(_yield_sim_idx())
    self.sim_index = sim_table
    return sim_table.to_pandas()

sql_query(query, return_type='pandas')

テーブルに対して SQL ライクなクエリを実行します。LanceDBの述語プッシュダウンを利用します。

パラメーター

名称 タイプ 説明 デフォルト
query str

実行するSQLクエリ。

必須
return_type str

返す結果の型。'pandas' または 'arrow' のいずれかを指定します。デフォルトは 'pandas' です。

'pandas'

リターンズ

タイプ 説明
Table

クエリ結果。return_type が 'pandas' の場合は pandas データフレーム、'arrow' の場合は Arrow テーブルを返します。

例
exp = Explorer()
exp.create_embeddings_table()
query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
result = exp.sql_query(query)
ソースコード ultralytics/data/explorer/explorer.py
def sql_query(
    self, query: str, return_type: str = "pandas"
) -> Union[DataFrame, Any, None]:  # pandas.dataframe or pyarrow.Table
    """
    Run a SQL-Like query on the table. The query is executed with DuckDB against an Arrow view of the table.

    Args:
        query (str): SQL query to run. Either a full `SELECT ...` statement or just a `WHERE ...` clause, in which
            case `SELECT * FROM 'table'` is prepended. Surrounding whitespace and keyword case are ignored when
            checking the prefix.
        return_type (str): Type of the result to return. Can be either 'pandas' or 'arrow'. Defaults to 'pandas'.

    Returns:
        (pandas.DataFrame | pyarrow.Table): The query result, as a dataframe when `return_type` is 'pandas' and as
            an arrow table when it is 'arrow'.

    Raises:
        AssertionError: If `return_type` is not 'pandas' or 'arrow'.
        ValueError: If the table has not been created, or if the query starts with neither SELECT nor WHERE.

    Example:
        ```python
        exp = Explorer()
        exp.create_embeddings_table()
        query = "SELECT * FROM 'table' WHERE labels LIKE '%person%'"
        result = exp.sql_query(query)
        ```
    """
    assert return_type in {
        "pandas",
        "arrow",
    }, f"Return type should be either `pandas` or `arrow`, but got {return_type}"

    if self.table is None:
        raise ValueError("Table is not created. Please create the table first.")

    # Check the query before importing DuckDB or materializing the table, so bad input fails cheaply.
    query = query.strip()
    keyword = query[:6].upper()
    if not keyword.startswith(("SELECT", "WHERE")):
        raise ValueError(
            f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE clause. found {query}"
        )
    if keyword.startswith("WHERE"):
        query = f"SELECT * FROM 'table' {query}"

    import duckdb

    # Note: using filter pushdown would be a better long term solution. Temporarily using duckdb for this.
    # The local variable must stay named `table`: the query refers to it as 'table' and DuckDB looks it up by name.
    table = self.table.to_arrow()  # noqa NOTE: Don't comment this. This line is used by DuckDB
    LOGGER.info(f"Running query: {query}")

    rs = duckdb.sql(query)
    if return_type == "arrow":
        return rs.arrow()
    elif return_type == "pandas":
        return rs.df()

visualize(result)

クエリの結果を視覚化する。TODO。

パラメーター

名称 タイプ 説明 デフォルト
result Table

クエリの結果を含むテーブル。

必須
ソースコード ultralytics/data/explorer/explorer.py
def visualize(self, result):
    """
    Placeholder for visualizing the results of a query; not implemented yet (TODO).

    Args:
        result (pyarrow.Table): Table containing the results of a query.

    Returns:
        None: The method currently does nothing with `result`.
    """
    return None





作成日:2024-01-10 更新日:2024-01-10
作成者:glenn-jocher(1)

コメント