PoseAnything

Running

App Files Community

orhir committed on Dec 21, 2023

Commit

241adf2

1 Parent(s): 23abb8d

Upload 97 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

LICENSE +203 -0
Pose_Anything_Teaser.png +0 -0
README.md +145 -13
app.py +320 -0
configs/1shot-swin/base_split1_config.py +190 -0
configs/1shot-swin/base_split2_config.py +190 -0
configs/1shot-swin/base_split3_config.py +190 -0
configs/1shot-swin/base_split4_config.py +190 -0
configs/1shot-swin/base_split5_config.py +190 -0
configs/1shot-swin/graph_split1_config.py +191 -0
configs/1shot-swin/graph_split2_config.py +191 -0
configs/1shot-swin/graph_split3_config.py +191 -0
configs/1shot-swin/graph_split4_config.py +191 -0
configs/1shot-swin/graph_split5_config.py +191 -0
configs/1shots/base_split1_config.py +190 -0
configs/1shots/base_split2_config.py +190 -0
configs/1shots/base_split3_config.py +190 -0
configs/1shots/base_split4_config.py +190 -0
configs/1shots/base_split5_config.py +190 -0
configs/1shots/graph_split1_config.py +191 -0
configs/1shots/graph_split2_config.py +191 -0
configs/1shots/graph_split3_config.py +191 -0
configs/1shots/graph_split4_config.py +191 -0
configs/1shots/graph_split5_config.py +191 -0
configs/5shot-swin/base_split1_config.py +190 -0
configs/5shot-swin/base_split2_config.py +190 -0
configs/5shot-swin/base_split3_config.py +190 -0
configs/5shot-swin/base_split4_config.py +190 -0
configs/5shot-swin/base_split5_config.py +190 -0
configs/5shot-swin/graph_split1_config.py +191 -0
configs/5shot-swin/graph_split2_config.py +191 -0
configs/5shot-swin/graph_split3_config.py +191 -0
configs/5shot-swin/graph_split4_config.py +191 -0
configs/5shot-swin/graph_split5_config.py +191 -0
configs/5shots/base_split1_config.py +190 -0
configs/5shots/base_split2_config.py +190 -0
configs/5shots/base_split3_config.py +190 -0
configs/5shots/base_split4_config.py +190 -0
configs/5shots/base_split5_config.py +190 -0
configs/5shots/graph_split1_config.py +191 -0
configs/5shots/graph_split2_config.py +191 -0
configs/5shots/graph_split3_config.py +191 -0
configs/5shots/graph_split4_config.py +191 -0
configs/5shots/graph_split5_config.py +191 -0
configs/demo.py +194 -0
configs/demo_b.py +191 -0
demo.py +289 -0
docker/Dockerfile +50 -0
gradio_teaser.png +0 -0
models/VERSION +1 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,203 @@

+Copyright (c) 2022 SenseTime. All Rights Reserved.
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2020 MMClassification Authors.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

Pose_Anything_Teaser.png ADDED Viewed

README.md CHANGED Viewed

@@ -1,13 +1,145 @@
----
-title: PoseAnything
-emoji: 🏢
-colorFrom: red
-colorTo: red
-sdk: gradio
-sdk_version: 4.11.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Pose Anything: A Graph-Based Approach for Category-Agnostic Pose Estimation
+<a href="https://orhir.github.io/pose-anything/"><img src="https://img.shields.io/static/v1?label=Project&message=Website&color=blue"></a>
+<a href="https://arxiv.org/abs/2311.17891"><img src="https://img.shields.io/badge/arXiv-2311.17891-b31b1b.svg"></a>
+<a href="https://www.apache.org/licenses/LICENSE-2.0.txt"><img src="https://img.shields.io/badge/License-Apache-yellow"></a>
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pose-anything-a-graph-based-approach-for/2d-pose-estimation-on-mp-100)](https://paperswithcode.com/sota/2d-pose-estimation-on-mp-100?p=pose-anything-a-graph-based-approach-for)
+By [Or Hirschorn](https://scholar.google.co.il/citations?user=GgFuT_QAAAAJ&hl=iw&oi=ao) and [Shai Avidan](https://scholar.google.co.il/citations?hl=iw&user=hpItE1QAAAAJ)
+This repo is the official implementation of "[Pose Anything: A Graph-Based Approach for Category-Agnostic Pose Estimation](https://arxiv.org/pdf/2311.17891.pdf)".
+<p align="center">
+<img src="Pose_Anything_Teaser.png" width="384">
+</p>
+## Introduction
+We present a novel approach to CAPE that leverages the inherent geometrical relations between keypoints through a newly designed Graph Transformer Decoder. By capturing and incorporating this crucial structural information, our method enhances the accuracy of keypoint localization, marking a significant departure from conventional CAPE techniques that treat keypoints as isolated entities.
+## Citation
+If you find this useful, please cite this work as follows:
+```bibtex
+@misc{hirschorn2023pose,
+      title={Pose Anything: A Graph-Based Approach for Category-Agnostic Pose Estimation},
+      author={Or Hirschorn and Shai Avidan},
+      year={2023},
+      eprint={2311.17891},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
+## Getting Started
+### Docker [Recommended]
+We provide a docker image for easy use.
+You can simply pull the docker image from docker hub, containing all the required libraries and packages:
+```
+docker pull orhir/pose_anything
+docker run --name pose_anything -v {DATA_DIR}:/workspace/PoseAnything/PoseAnything/data/mp100 -it orhir/pose_anything /bin/bash
+```
+### Conda Environment
+We train and evaluate our model on Python 3.8 and Pytorch 2.0.1 with CUDA 12.1.
+Please first install PyTorch and torchvision following the official PyTorch documentation.
+Then, follow [MMPose](https://mmpose.readthedocs.io/en/latest/installation.html) to install the following packages:
+```
+mmcv-full=1.6.2
+mmpose=0.29.0
+```
+Having installed these packages, run:
+```
+python setup.py develop
+```
+## Demo on Custom Images
+We provide a demo code to test our code on custom images.
+***A bigger and more accurate version of the model - COMING SOON!***
+### Gradio Demo
+First, install gradio:
+```
+pip install gradio==3.44.0
+```
+Then, download the [pretrained model](https://drive.google.com/file/d/1RT1Q8AMEa1kj6k9ZqrtWIKyuR4Jn4Pqc/view?usp=drive_link) and run:
+```
+python app.py --checkpoint [path_to_pretrained_ckpt]
+```
+### Terminal Demo
+Download
+the [pretrained model](https://drive.google.com/file/d/1RT1Q8AMEa1kj6k9ZqrtWIKyuR4Jn4Pqc/view?usp=drive_link)
+and run:
+```
+python demo.py --support [path_to_support_image] --query [path_to_query_image] --config configs/demo_b.py --checkpoint [path_to_pretrained_ckpt]
+```
+***Note:*** The demo code supports any config with suitable checkpoint file. More pre-trained models can be found in the evaluation section.
+## MP-100 Dataset
+Please follow the [official guide](https://github.com/luminxu/Pose-for-Everything/blob/main/mp100/README.md) to prepare the MP-100 dataset for training and evaluation, and organize the data structure properly.
+We provide an updated annotation file, which includes skeleton definitions, in the following [link](https://drive.google.com/drive/folders/1uRyGB-P5Tc_6TmAZ6RnOi0SWjGq9b28T?usp=sharing).
+**Please note:**
+The current version of the MP-100 dataset includes some discrepancies and filename errors:
+1. Note that the mentioned DeepFasion dataset is actually DeepFashion2 dataset. The link in the official repo is wrong. Use this [repo](https://github.com/switchablenorms/DeepFashion2/tree/master) instead.
+2. We provide a script to fix CarFusion filename errors, which can be run by:
+```
+python tools/fix_carfusion.py [path_to_CarFusion_dataset] [path_to_mp100_annotation]
+```
+## Training
+### Backbone Options
+To use pre-trained Swin-Transformer as used in our paper, we provide the weights, taken from this [repo](https://github.com/microsoft/Swin-Transformer/blob/main/MODELHUB.md), in the following [link](https://drive.google.com/drive/folders/1-q4mSxlNAUwDlevc3Hm5Ij0l_2OGkrcg?usp=sharing).
+These should be placed in the `./pretrained` folder.
+We also support DINO and ResNet backbones. To use them, you can easily change the config file to use the desired backbone.
+This can be done by changing the `pretrained` field in the config file to `dinov2`, `dino` or `resnet` respectively (this will automatically load the pretrained weights from the official repo).
+### Training
+To train the model, run:
+```
+python train.py --config [path_to_config_file]  --work-dir [path_to_work_dir]
+```
+## Evaluation and Pretrained Models
+You can download the pretrained checkpoints from the following [link](https://drive.google.com/drive/folders/1RmrqzE3g0qYRD5xn54-aXEzrIkdYXpEW?usp=sharing).
+Here we provide the evaluation results of our pretrained models on MP-100 dataset along with the config files and checkpoints:
+### 1-Shot Models
+| Setting |                                                                       split 1                                                                       |                                                                       split 2                                                                       |                                                                       split 3                                                                       |                                                                       split 4                                                                       |                                                                       split 5                                                                       |
+|:-------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|
+|  Tiny   |                                                                        91.06                                                                        |                                                                        88.24                                                                        |                                                                        86.09                                                                        |                                                                        86.17                                                                        |                                                                        85.78                                                                        |
+|         |   [link](https://drive.google.com/file/d/1GubmkVkqybs-eD4hiRkgBzkUVGE_rIFX/view?usp=drive_link) / [config](configs/1shots/graph_split1_config.py)   |   [link](https://drive.google.com/file/d/1EEekDF3xV_wJOVk7sCQWUA8ygUKzEm2l/view?usp=drive_link) / [config](configs/1shots/graph_split2_config.py)   |   [link](https://drive.google.com/file/d/1FuwpNBdPI3mfSovta2fDGKoqJynEXPZQ/view?usp=drive_link) / [config](configs/1shots/graph_split3_config.py)   |   [link](https://drive.google.com/file/d/1_SSqSANuZlbC0utzIfzvZihAW9clefcR/view?usp=drive_link) / [config](configs/1shots/graph_split4_config.py)   |   [link](https://drive.google.com/file/d/1nUHr07W5F55u-FKQEPFq_CECgWZOKKLF/view?usp=drive_link) / [config](configs/1shots/graph_split5_config.py)   |
+|  Small  |                                                                        93.66                                                                        |                                                                        90.42                                                                        |                                                                        89.79                                                                        |                                                                        88.68                                                                        |                                                                        89.61                                                                        |
+|         | [link](https://drive.google.com/file/d/1RT1Q8AMEa1kj6k9ZqrtWIKyuR4Jn4Pqc/view?usp=drive_link) / [config](configs/1shot-swin/graph_split1_config.py) | [link](https://drive.google.com/file/d/1BT5b8MlnkflcdhTFiBROIQR3HccLsPQd/view?usp=drive_link) / [config](configs/1shot-swin/graph_split2_config.py) | [link](https://drive.google.com/file/d/1Z64cw_1CSDGObabSAWKnMK0BA_bqDHxn/view?usp=drive_link) / [config](configs/1shot-swin/graph_split3_config.py) | [link](https://drive.google.com/file/d/1vf82S8LAjIzpuBcbEoDCa26cR8DqNriy/view?usp=drive_link) / [config](configs/1shot-swin/graph_split4_config.py) | [link](https://drive.google.com/file/d/14FNx0JNbkS2CvXQMiuMU_kMZKFGO2rDV/view?usp=drive_link) / [config](configs/1shot-swin/graph_split5_config.py) |
+### 5-Shot Models
+| Setting |                                                                       split 1                                                                       |                                                                       split 2                                                                       |                                                                       split 3                                                                       |                                                                       split 4                                                                       |                                                                       split 5                                                                       |
+|:-------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:|
+|  Tiny   |                                                                        94.18                                                                        |                                                                        91.46                                                                        |                                                                        90.50                                                                        |                                                                        90.18                                                                        |                                                                        89.47                                                                        |
+|         |   [link](https://drive.google.com/file/d/1PeMuwv5YwiF3UCE5oN01Qchu5K3BaQ9L/view?usp=drive_link) / [config](configs/5shots/graph_split1_config.py)   |   [link](https://drive.google.com/file/d/1enIapPU1D8lZOET7q_qEjnhC1HFy3jWK/view?usp=drive_link) / [config](configs/5shots/graph_split2_config.py)   |   [link](https://drive.google.com/file/d/1MTeZ9Ba-ucLuqX0KBoLbBD5PaEct7VUp/view?usp=drive_link) / [config](configs/5shots/graph_split3_config.py)   |   [link](https://drive.google.com/file/d/1U2N7DI2F0v7NTnPCEEAgx-WKeBZNAFoa/view?usp=drive_link) / [config](configs/5shots/graph_split4_config.py)   |   [link](https://drive.google.com/file/d/1wapJDgtBWtmz61JNY7ktsFyvckRKiR2C/view?usp=drive_link) / [config](configs/5shots/graph_split5_config.py)   |
+|  Small  |                                                                        96.51                                                                        |                                                                        92.15                                                                        |                                                                        91.99                                                                        |                                                                        92.01                                                                        |                                                                        92.36                                                                        |
+|         | [link](https://drive.google.com/file/d/1p5rnA0MhmndSKEbyXMk49QXvNE03QV2p/view?usp=drive_link) / [config](configs/5shot-swin/graph_split1_config.py) | [link](https://drive.google.com/file/d/1Q3KNyUW_Gp3JytYxUPhkvXFiDYF6Hv8w/view?usp=drive_link) / [config](configs/5shot-swin/graph_split2_config.py) | [link](https://drive.google.com/file/d/1gWgTk720fSdAf_ze1FkfXTW0t7k-69dV/view?usp=drive_link) / [config](configs/5shot-swin/graph_split3_config.py) | [link](https://drive.google.com/file/d/1LuaRQ8a6AUPrkr7l5j2W6Fe_QbgASkwY/view?usp=drive_link) / [config](configs/5shot-swin/graph_split4_config.py) | [link](https://drive.google.com/file/d/1z--MAOPCwMG_GQXru9h2EStbnIvtHv1L/view?usp=drive_link) / [config](configs/5shot-swin/graph_split5_config.py) |
+### Evaluation
+The evaluation on a single GPU will take approximately 30 min.
+To evaluate the pretrained model, run:
+```
+python test.py [path_to_config_file] [path_to_pretrained_ckpt]
+```
+## Acknowledgement
+Our code is based on code from:
+ - [MMPose](https://github.com/open-mmlab/mmpose)
+ - [CapeFormer](https://github.com/flyinglynx/CapeFormer)
+## License
+This project is released under the Apache 2.0 license.

app.py ADDED Viewed

	@@ -0,0 +1,320 @@

+import argparse
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import random
+# os.system('python -m pip install timm')
+# os.system('python -m pip install -U openxlab')
+# os.system('python -m pip install -U pillow')
+# os.system('python -m pip install Openmim')
+# os.system('python -m mim install mmengine')
+os.system('python -m mim install "mmcv-full==1.6.2"')
+os.system('python -m mim install "mmpose==0.29.0"')
+os.system('python -m mim install "gradio==3.44.0"')
+os.system('python setup.py develop')
+import gradio as gr
+import numpy as np
+import torch
+from PIL import ImageDraw, Image
+from matplotlib import pyplot as plt
+from mmcv import Config
+from mmcv.runner import load_checkpoint
+from mmpose.core import wrap_fp16_model
+from mmpose.models import build_posenet
+from torchvision import transforms
+from demo import Resize_Pad
+from models import *
+import matplotlib
+matplotlib.use('agg')
+def plot_results(support_img, query_img, support_kp, support_w, query_kp,
+                 query_w, skeleton,
+                 initial_proposals, prediction, radius=6):
+    h, w, c = support_img.shape
+    prediction = prediction[-1].cpu().numpy() * h
+    query_img = (query_img - np.min(query_img)) / (
+            np.max(query_img) - np.min(query_img))
+    for id, (img, w, keypoint) in enumerate(zip([query_img],
+                                                [query_w],
+                                                [prediction])):
+        f, axes = plt.subplots()
+        plt.imshow(img)
+        for k in range(keypoint.shape[0]):
+            if w[k] > 0:
+                kp = keypoint[k, :2]
+                c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6)
+                patch = plt.Circle(kp, radius, color=c)
+                axes.add_patch(patch)
+                axes.text(kp[0], kp[1], k)
+                plt.draw()
+        for l, limb in enumerate(skeleton):
+            kp = keypoint[:, :2]
+            if l > len(COLORS) - 1:
+                c = [x / 255 for x in random.sample(range(0, 255), 3)]
+            else:
+                c = [x / 255 for x in COLORS[l]]
+            if w[limb[0]] > 0 and w[limb[1]] > 0:
+                patch = plt.Line2D([kp[limb[0], 0], kp[limb[1], 0]],
+                                   [kp[limb[0], 1], kp[limb[1], 1]],
+                                   linewidth=6, color=c, alpha=0.6)
+                axes.add_artist(patch)
+        plt.axis('off')  # command for hiding the axis.
+        plt.subplots_adjust(0, 0, 1, 1, 0, 0)
+        return plt
+# Limb colour palette as 0-255 triples; plot_results and get_limbs index it by
+# limb number and fall back to random colours once it is exhausted.
+COLORS = [
+    [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
+    [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
+    [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
+    [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]
+]
+# Module-level state shared by the Gradio callbacks below.
+# Keypoints clicked on the support image, appended by get_select_coords.
+kp_src = []
+# Limbs as pairs of indices into kp_src, appended by get_limbs.
+skeleton = []
+# 0 while waiting for the first endpoint of a limb, 1 while waiting for the second.
+count = 0
+# Index of the next palette colour used by get_limbs.
+color_idx = 0
+# Coordinates and kp_src index of the first endpoint of the limb being drawn.
+prev_pt = None
+prev_pt_idx = None
+# Last pixel handled by get_limbs; a repeated click on it is ignored.
+prev_clicked = None
+# Channel-reversed copy of the uploaded support image, set by set_query.
+original_support_image = None
+# Checkpoint location; overwritten from --checkpoint in the __main__ block.
+checkpoint_path = ''
+def process(query_img,
+            cfg_path='configs/demo_b.py'):
+    global skeleton
+    cfg = Config.fromfile(cfg_path)
+    kp_src_np = np.array(kp_src).copy().astype(np.float32)
+    kp_src_np[:, 0] = kp_src_np[:, 0] / 128. * cfg.model.encoder_config.img_size
+    kp_src_np[:, 1] = kp_src_np[:, 1] / 128. * cfg.model.encoder_config.img_size
+    kp_src_np = np.flip(kp_src_np, 1).copy()
+    kp_src_tensor = torch.tensor(kp_src_np).float()
+    preprocess = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        Resize_Pad(cfg.model.encoder_config.img_size,
+                   cfg.model.encoder_config.img_size)])
+    if len(skeleton) == 0:
+        skeleton = [(0, 0)]
+    support_img = preprocess(original_support_image).flip(0)[None]
+    np_query = np.array(query_img)[:, :, ::-1].copy()
+    q_img = preprocess(np_query).flip(0)[None]
+    # Create heatmap from keypoints
+    genHeatMap = TopDownGenerateTargetFewShot()
+    data_cfg = cfg.data_cfg
+    data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size,
+                                       cfg.model.encoder_config.img_size])
+    data_cfg['joint_weights'] = None
+    data_cfg['use_different_joint_weights'] = False
+    kp_src_3d = torch.concatenate(
+        (kp_src_tensor, torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
+    kp_src_3d_weight = torch.concatenate(
+        (torch.ones_like(kp_src_tensor),
+         torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
+    target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg,
+                                                                 kp_src_3d,
+                                                                 kp_src_3d_weight,
+                                                                 sigma=1)
+    target_s = torch.tensor(target_s).float()[None]
+    target_weight_s = torch.ones_like(
+        torch.tensor(target_weight_s).float()[None])
+    data = {
+        'img_s': [support_img],
+        'img_q': q_img,
+        'target_s': [target_s],
+        'target_weight_s': [target_weight_s],
+        'target_q': None,
+        'target_weight_q': None,
+        'return_loss': False,
+        'img_metas': [{'sample_skeleton': [skeleton],
+                       'query_skeleton': skeleton,
+                       'sample_joints_3d': [kp_src_3d],
+                       'query_joints_3d': kp_src_3d,
+                       'sample_center': [kp_src_tensor.mean(dim=0)],
+                       'query_center': kp_src_tensor.mean(dim=0),
+                       'sample_scale': [
+                           kp_src_tensor.max(dim=0)[0] -
+                           kp_src_tensor.min(dim=0)[0]],
+                       'query_scale': kp_src_tensor.max(dim=0)[0] -
+                                      kp_src_tensor.min(dim=0)[0],
+                       'sample_rotation': [0],
+                       'query_rotation': 0,
+                       'sample_bbox_score': [1],
+                       'query_bbox_score': 1,
+                       'query_image_file': '',
+                       'sample_image_file': [''],
+                       }]
+    }
+    # Load model
+    model = build_posenet(cfg.model)
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    load_checkpoint(model, checkpoint_path, map_location='cpu')
+    model.eval()
+    with torch.no_grad():
+        outputs = model(**data)
+    # visualize results
+    vis_s_weight = target_weight_s[0]
+    vis_q_weight = target_weight_s[0]
+    vis_s_image = support_img[0].detach().cpu().numpy().transpose(1, 2, 0)
+    vis_q_image = q_img[0].detach().cpu().numpy().transpose(1, 2, 0)
+    support_kp = kp_src_3d
+    out = plot_results(vis_s_image,
+                       vis_q_image,
+                       support_kp,
+                       vis_s_weight,
+                       None,
+                       vis_q_weight,
+                       skeleton,
+                       None,
+                       torch.tensor(outputs['points']).squeeze(0),
+                       )
+    return out
+# Build the Gradio UI; the callbacks and event wiring follow inside this block.
+with gr.Blocks() as demo:
+    gr.Markdown('''
+    # Pose Anything Demo
+    We present a novel approach to category agnostic pose estimation that leverages the inherent geometrical relations between keypoints through a newly designed Graph Transformer Decoder. By capturing and incorporating this crucial structural information, our method enhances the accuracy of keypoint localization, marking a significant departure from conventional CAPE techniques that treat keypoints as isolated entities.
+    ### [Paper](https://arxiv.org/abs/2311.17891) | [Official Repo](https://github.com/orhir/PoseAnything)
+    ![](/file=gradio_teaser.png)
+    ## Instructions
+    1. Upload an image of the object you want to pose on the **left** image.
+    2. Click on the **left** image to mark keypoints.
+    3. Click on the keypoints on the **right** image to mark limbs.
+    4. Upload an image of the object you want to pose to the query image (**bottom**).
+    5. Click **Evaluate** to pose the query image.
+    ''')
+    # Row 1: support image (keypoint clicks) next to its copy used for limbs.
+    with gr.Row():
+        support_img = gr.Image(label="Support Image",
+                               type="pil",
+                               info='Click to mark keypoints').style(
+            height=256, width=256)
+        # Not user-editable: it is only written by the callbacks.
+        posed_support = gr.Image(label="Posed Support Image",
+                                 type="pil",
+                                 interactive=False).style(height=256, width=256)
+    # Row 2: the query image to run the model on.
+    with gr.Row():
+        query_img = gr.Image(label="Query Image",
+                             type="pil").style(height=256, width=256)
+    # Row 3: button that triggers inference.
+    with gr.Row():
+        eval_btn = gr.Button(value="Evaluate")
+    # Row 4: plot component that receives the result of process().
+    with gr.Row():
+        output_img = gr.Plot(label="Output Image", height=256, width=256)
+    def get_select_coords(kp_support,
+                          limb_support,
+                          evt: gr.SelectData,
+                          r=0.015):
+        pixels_in_queue = set()
+        pixels_in_queue.add((evt.index[1], evt.index[0]))
+        while len(pixels_in_queue) > 0:
+            pixel = pixels_in_queue.pop()
+            if pixel[0] is not None and pixel[
+                1] is not None and pixel not in kp_src:
+                kp_src.append(pixel)
+            else:
+                print("Invalid pixel")
+            if limb_support is None:
+                canvas_limb = kp_support
+            else:
+                canvas_limb = limb_support
+            canvas_kp = kp_support
+            w, h = canvas_kp.size
+            draw_pose = ImageDraw.Draw(canvas_kp)
+            draw_limb = ImageDraw.Draw(canvas_limb)
+            r = int(r * w)
+            leftUpPoint = (pixel[1] - r, pixel[0] - r)
+            rightDownPoint = (pixel[1] + r, pixel[0] + r)
+            twoPointList = [leftUpPoint, rightDownPoint]
+            draw_pose.ellipse(twoPointList, fill=(255, 0, 0, 255))
+            draw_limb.ellipse(twoPointList, fill=(255, 0, 0, 255))
+        return canvas_kp, canvas_limb
+    def get_limbs(kp_support,
+                  evt: gr.SelectData,
+                  r=0.02, width=0.02):
+        global count, color_idx, prev_pt, skeleton, prev_pt_idx, prev_clicked
+        curr_pixel = (evt.index[1], evt.index[0])
+        pixels_in_queue = set()
+        pixels_in_queue.add((evt.index[1], evt.index[0]))
+        canvas_kp = kp_support
+        w, h = canvas_kp.size
+        r = int(r * w)
+        width = int(width * w)
+        while (len(pixels_in_queue) > 0 and
+               curr_pixel != prev_clicked and
+               len(kp_src) > 0):
+            pixel = pixels_in_queue.pop()
+            prev_clicked = pixel
+            closest_point = min(kp_src,
+                                key=lambda p: (p[0] - pixel[0]) ** 2 +
+                                              (p[1] - pixel[1]) ** 2)
+            closest_point_index = kp_src.index(closest_point)
+            draw_limb = ImageDraw.Draw(canvas_kp)
+            if color_idx < len(COLORS):
+                c = COLORS[color_idx]
+            else:
+                c = random.choices(range(256), k=3)
+            leftUpPoint = (closest_point[1] - r, closest_point[0] - r)
+            rightDownPoint = (closest_point[1] + r, closest_point[0] + r)
+            twoPointList = [leftUpPoint, rightDownPoint]
+            draw_limb.ellipse(twoPointList, fill=tuple(c))
+            if count == 0:
+                prev_pt = closest_point[1], closest_point[0]
+                prev_pt_idx = closest_point_index
+                count = count + 1
+            else:
+                if prev_pt_idx != closest_point_index:
+                    # Create Line and add Limb
+                    draw_limb.line([prev_pt, (closest_point[1], closest_point[0])],
+                                   fill=tuple(c),
+                                   width=width)
+                    skeleton.append((prev_pt_idx, closest_point_index))
+                    color_idx = color_idx + 1
+                else:
+                    draw_limb.ellipse(twoPointList, fill=(255, 0, 0, 255))
+                count = 0
+        return canvas_kp
+    def set_query(support_img):
+        global original_support_image
+        skeleton.clear()
+        kp_src.clear()
+        original_support_image = np.array(support_img)[:, :, ::-1].copy()
+        support_img = support_img.resize((128, 128), Image.Resampling.LANCZOS)
+        return support_img, support_img
+    support_img.select(get_select_coords,
+                       [support_img, posed_support],
+                       [support_img, posed_support],
+                       )
+    support_img.upload(set_query,
+                       inputs=support_img,
+                       outputs=[support_img,posed_support])
+    posed_support.select(get_limbs,
+                         posed_support,
+                         posed_support)
+    eval_btn.click(fn=process,
+                   inputs=[query_img],
+                   outputs=output_img)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Pose Anything Demo')
+    parser.add_argument('--checkpoint',
+                        help='checkpoint path',
+                        default='https://huggingface.co/orhir/PoseAnything/blob/main/1shot-swin_graph_split1.pth')
+    args = parser.parse_args()
+    checkpoint_path = args.checkpoint
+    demo.launch()

configs/1shot-swin/base_split1_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+# Runtime settings: no checkpoint is loaded or resumed by default.
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)
+# Evaluation reports four metrics; PCK is the key indicator.
+evaluation = dict(
+    interval=25,
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',
+    gpu_collect=True,
+    res_folder='')
+# Adam with a fixed base learning rate and no gradient clipping.
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+# Step schedule with linear warm-up (warmup_iters=1000); the steps at 160 and
+# 180 are presumably epochs out of total_epochs below - confirm.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])
+total_epochs = 200
+# Log to text and TensorBoard hooks.
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+# Channel layout: a single output channel / joint entry; max_kpt_num is reused
+# by the dataset definitions further down in this file.
+channel_cfg = dict(
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)
+# model settings
+# PoseAnythingModel with a SwinTransformerV2 encoder and a transformer-based
+# keypoint head; the encoder weights are read from the local `pretrained/`
+# folder.
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',
+    # Encoder input resolution (img_size) matches data_cfg['image_size'] below.
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        # NOTE(review): presumably the channel width of the encoder output
+        # (embed_dim * 8) - confirm against the encoder implementation.
+        in_channels=1024,
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        # Same value as transformer.num_decoder_layers above.
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+# Data geometry shared by all datasets; channel fields come from channel_cfg.
+data_cfg = dict(
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered transform steps applied to training samples
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # random scale/rotation step; valid_pipeline has no such step
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # three mean/std values, same as in valid_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as valid_pipeline
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as train_pipeline, wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the paths below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(  # training set: split-1 train annotations, train_pipeline
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        pipeline=train_pipeline),
+    val=dict(  # validation set: split-1 val annotations, 100 episodes
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(  # test set: split-1 test annotations, 200 episodes, different dataset type
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # five PCK thresholds, set for the test entry only
+        pipeline=test_pipeline),
+)
+vis_backends = [  # two visualization backends: local and TensorBoard
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(  # visualizer entry that reuses the vis_backends list
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm its meaning

configs/1shot-swin/base_split2_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # None: no checkpoint path is set for loading
+resume_from = None  # None: no checkpoint path is set for resuming
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase
+checkpoint_config = dict(interval=20)  # NOTE(review): interval unit (epochs vs. iterations) is decided by the runner - confirm
+evaluation = dict(  # evaluation settings
+    interval=25,  # differs from the checkpoint interval (20) set above
+    metric=['PCK', 'NME', 'AUC', 'EPE'],  # metrics to report
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')  # empty string: no result folder is named here
+optimizer = dict(  # Adam optimizer with base learning rate 1e-5
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # None: no gradient clipping is configured
+# learning-rate schedule: step policy with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200)
+total_epochs = 200
+log_config = dict(  # logging hooks with a reporting interval of 50
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),  # text log output
+        dict(type='TensorboardLoggerHook')  # TensorBoard output
+    ])
+channel_cfg = dict(  # channel layout; its values are reused by data_cfg and the dataset entries below
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, like num_heads and pretrained_window_sizes
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # NOTE(review): presumably the encoder's last-stage width (128 * 8) - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training options are set here
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # dataset settings shared by the train/val/test entries; channel fields come from channel_cfg
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered transform steps applied to training samples
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # random scale/rotation step; valid_pipeline has no such step
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # three mean/std values, same as in valid_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as valid_pipeline
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as train_pipeline, wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the paths below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(  # training set: split-2 train annotations, train_pipeline
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        pipeline=train_pipeline),
+    val=dict(  # validation set: split-2 val annotations, 100 episodes
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(  # test set: split-2 test annotations, 200 episodes, different dataset type
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # five PCK thresholds, set for the test entry only
+        pipeline=test_pipeline),
+)
+vis_backends = [  # two visualization backends: local and TensorBoard
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(  # visualizer entry that reuses the vis_backends list
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm its meaning

configs/1shot-swin/base_split3_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # None: no checkpoint path is set for loading
+resume_from = None  # None: no checkpoint path is set for resuming
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase
+checkpoint_config = dict(interval=20)  # NOTE(review): interval unit (epochs vs. iterations) is decided by the runner - confirm
+evaluation = dict(  # evaluation settings
+    interval=25,  # differs from the checkpoint interval (20) set above
+    metric=['PCK', 'NME', 'AUC', 'EPE'],  # metrics to report
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')  # empty string: no result folder is named here
+optimizer = dict(  # Adam optimizer with base learning rate 1e-5
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # None: no gradient clipping is configured
+# learning-rate schedule: step policy with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200)
+total_epochs = 200
+log_config = dict(  # logging hooks with a reporting interval of 50
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),  # text log output
+        dict(type='TensorboardLoggerHook')  # TensorBoard output
+    ])
+channel_cfg = dict(  # channel layout; its values are reused by data_cfg and the dataset entries below
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, like num_heads and pretrained_window_sizes
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # NOTE(review): presumably the encoder's last-stage width (128 * 8) - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training options are set here
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # dataset settings shared by the train/val/test entries; channel fields come from channel_cfg
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered transform steps applied to training samples
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # random scale/rotation step; valid_pipeline has no such step
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # three mean/std values, same as in valid_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as valid_pipeline
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as train_pipeline, wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the paths below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(  # training set: split-3 train annotations, train_pipeline
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        pipeline=train_pipeline),
+    val=dict(  # validation set: split-3 val annotations, 100 episodes
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(  # test set: split-3 test annotations, 200 episodes, different dataset type
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # five PCK thresholds, set for the test entry only
+        pipeline=test_pipeline),
+)
+vis_backends = [  # two visualization backends: local and TensorBoard
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(  # visualizer entry that reuses the vis_backends list
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm its meaning

configs/1shot-swin/base_split4_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # None: no checkpoint path is set for loading
+resume_from = None  # None: no checkpoint path is set for resuming
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase
+checkpoint_config = dict(interval=20)  # NOTE(review): interval unit (epochs vs. iterations) is decided by the runner - confirm
+evaluation = dict(  # evaluation settings
+    interval=25,  # differs from the checkpoint interval (20) set above
+    metric=['PCK', 'NME', 'AUC', 'EPE'],  # metrics to report
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')  # empty string: no result folder is named here
+optimizer = dict(  # Adam optimizer with base learning rate 1e-5
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # None: no gradient clipping is configured
+# learning-rate schedule: step policy with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200)
+total_epochs = 200
+log_config = dict(  # logging hooks with a reporting interval of 50
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),  # text log output
+        dict(type='TensorboardLoggerHook')  # TensorBoard output
+    ])
+channel_cfg = dict(  # channel layout; its values are reused by data_cfg and the dataset entries below
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, like num_heads and pretrained_window_sizes
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # NOTE(review): presumably the encoder's last-stage width (128 * 8) - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training options are set here
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # dataset settings shared by the train/val/test entries; channel fields come from channel_cfg
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered transform steps applied to training samples
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # random scale/rotation step; valid_pipeline has no such step
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # three mean/std values, same as in valid_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as valid_pipeline
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as train_pipeline, wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the paths below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(  # training set: split-4 train annotations, train_pipeline
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        pipeline=train_pipeline),
+    val=dict(  # validation set: split-4 val annotations, 100 episodes
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(  # test set: split-4 test annotations, 200 episodes, different dataset type
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # five PCK thresholds, set for the test entry only
+        pipeline=test_pipeline),
+)
+vis_backends = [  # two visualization backends: local and TensorBoard
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(  # visualizer entry that reuses the vis_backends list
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm its meaning

configs/1shot-swin/base_split5_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # None: no checkpoint path is set for loading
+resume_from = None  # None: no checkpoint path is set for resuming
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase
+checkpoint_config = dict(interval=20)  # NOTE(review): interval unit (epochs vs. iterations) is decided by the runner - confirm
+evaluation = dict(  # evaluation settings
+    interval=25,  # differs from the checkpoint interval (20) set above
+    metric=['PCK', 'NME', 'AUC', 'EPE'],  # metrics to report
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')  # empty string: no result folder is named here
+optimizer = dict(  # Adam optimizer with base learning rate 1e-5
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # None: no gradient clipping is configured
+# learning-rate schedule: step policy with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200)
+total_epochs = 200
+log_config = dict(  # logging hooks with a reporting interval of 50
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),  # text log output
+        dict(type='TensorboardLoggerHook')  # TensorBoard output
+    ])
+channel_cfg = dict(  # channel layout; its values are reused by data_cfg and the dataset entries below
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, like num_heads and pretrained_window_sizes
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # NOTE(review): presumably the encoder's last-stage width (128 * 8) - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training options are set here
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # dataset settings shared by the train/val/test entries; channel fields come from channel_cfg
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered transform steps applied to training samples
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # random scale/rotation step; valid_pipeline has no such step
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # three mean/std values, same as in valid_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as valid_pipeline
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as train_pipeline, wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the paths below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(  # training set: split-5 train annotations, train_pipeline
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        pipeline=train_pipeline),
+    val=dict(  # validation set: split-5 val annotations, 100 episodes
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(  # test set: split-5 test annotations, 200 episodes, different dataset type
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # five PCK thresholds, set for the test entry only
+        pipeline=test_pipeline),
+)
+vis_backends = [  # two visualization backends: local and TensorBoard
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(  # visualizer entry that reuses the vis_backends list
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm its meaning

configs/1shot-swin/graph_split1_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # None: no checkpoint path is set for loading
+resume_from = None  # None: no checkpoint path is set for resuming
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase
+checkpoint_config = dict(interval=20)  # NOTE(review): interval unit (epochs vs. iterations) is decided by the runner - confirm
+evaluation = dict(  # evaluation settings
+    interval=25,  # differs from the checkpoint interval (20) set above
+    metric=['PCK', 'NME', 'AUC', 'EPE'],  # metrics to report
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')  # empty string: no result folder is named here
+optimizer = dict(  # Adam optimizer with base learning rate 1e-5
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # None: no gradient clipping is configured
+# learning-rate schedule: step policy with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200)
+total_epochs = 200
+log_config = dict(  # logging hooks with a reporting interval of 50
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),  # text log output
+        dict(type='TensorboardLoggerHook')  # TensorBoard output
+    ])
+channel_cfg = dict(  # channel layout; its values are reused by data_cfg and the dataset entries below
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, like num_heads and pretrained_window_sizes
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # NOTE(review): presumably the encoder's last-stage width (128 * 8) - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # extra key that the base_split configs in this directory do not set
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training options are set here
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # dataset settings shared by the train/val/test entries; channel fields come from channel_cfg
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered transform steps applied to training samples
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # random scale/rotation step; valid_pipeline has no such step
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # three mean/std values, same as in valid_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as valid_pipeline
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten meta keys as train_pipeline, wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the paths below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(  # training set: split-1 train annotations, train_pipeline
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        pipeline=train_pipeline),
+    val=dict(  # validation set: split-1 val annotations, 100 episodes
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(  # test set: split-1 test annotations, 200 episodes, different dataset type
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # five PCK thresholds, set for the test entry only
+        pipeline=test_pipeline),
+)
+vis_backends = [  # two visualization backends: local and TensorBoard
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(  # visualizer entry that reuses the vis_backends list
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm its meaning

configs/1shot-swin/graph_split2_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # None: no checkpoint path is set for loading
+resume_from = None  # None: no checkpoint path is set for resuming
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase
+checkpoint_config = dict(interval=20)  # NOTE(review): interval unit (epochs vs. iterations) is decided by the runner - confirm
+evaluation = dict(  # evaluation settings
+    interval=25,  # differs from the checkpoint interval (20) set above
+    metric=['PCK', 'NME', 'AUC', 'EPE'],  # metrics to report
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')  # empty string: no result folder is named here
+optimizer = dict(  # Adam optimizer with base learning rate 1e-5
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # None: no gradient clipping is configured
+# learning-rate schedule: step policy with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200)
+total_epochs = 200
+log_config = dict(  # logging hooks with a reporting interval of 50
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),  # text log output
+        dict(type='TensorboardLoggerHook')  # TensorBoard output
+    ])
+channel_cfg = dict(  # channel layout; its values are reused by data_cfg below
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, like num_heads and pretrained_window_sizes
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # NOTE(review): presumably the encoder's last-stage width (128 * 8) - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # extra key that the base_split configs in this directory do not set
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training options are set here
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # passed as data_cfg=data_cfg to the train/val/test dataset entries
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],  # this and the next three values come from channel_cfg
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered list of transform configs; used as `pipeline` of the train dataset
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only entry that valid_pipeline does not contain
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # three-value mean/std; these match the commonly used ImageNet statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same entries as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # identical values to train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_train.json',  # split-2 train annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting, same in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_val.json',  # split-2 val annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split2_test.json',  # split-2 test annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets this list
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to `visualizer` below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file

configs/1shot-swin/graph_split3_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # no initial checkpoint path is set
+resume_from = None  # no resume checkpoint path is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # checkpoint cadence; unit presumably epochs - confirm
+evaluation = dict(
+    interval=25,  # evaluation cadence; note it differs from the checkpoint interval (20)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # learning rate passed to the Adam optimizer
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# Learning-rate policy: step schedule with a linear warmup.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # step milestones; both are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,  # logging cadence; unit presumably iterations - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # channel bookkeeping; read later by data_cfg and by the entries of `data`
+    num_output_channels=1,
+    dataset_joints=1,  # copied into data_cfg as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded as max_kpt_num to the train/val/test dataset entries
+# Model settings: a SwinTransformerV2 encoder config plus a PoseHead keypoint head.
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to pretrained weights
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, paired with the four entries of num_heads
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # equals embed_dim * 8; presumably the encoder's final feature width - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # graph-decoder option; the meaning of 'pre' is not visible here
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # presumably -1 disables this option - TODO confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # intentionally empty
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # passed as data_cfg=data_cfg to the train/val/test dataset entries
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],  # this and the next three values come from channel_cfg
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered list of transform configs; used as `pipeline` of the train dataset
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only entry that valid_pipeline does not contain
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # three-value mean/std; these match the commonly used ImageNet statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same entries as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # identical values to train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_train.json',  # split-3 train annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting, same in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_val.json',  # split-3 val annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split3_test.json',  # split-3 test annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets this list
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to `visualizer` below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file

configs/1shot-swin/graph_split4_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # no initial checkpoint path is set
+resume_from = None  # no resume checkpoint path is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # checkpoint cadence; unit presumably epochs - confirm
+evaluation = dict(
+    interval=25,  # evaluation cadence; note it differs from the checkpoint interval (20)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # learning rate passed to the Adam optimizer
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# Learning-rate policy: step schedule with a linear warmup.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # step milestones; both are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,  # logging cadence; unit presumably iterations - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # channel bookkeeping; read later by data_cfg and by the entries of `data`
+    num_output_channels=1,
+    dataset_joints=1,  # copied into data_cfg as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded as max_kpt_num to the train/val/test dataset entries
+# Model settings: a SwinTransformerV2 encoder config plus a PoseHead keypoint head.
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to pretrained weights
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, paired with the four entries of num_heads
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # equals embed_dim * 8; presumably the encoder's final feature width - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # graph-decoder option; the meaning of 'pre' is not visible here
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # presumably -1 disables this option - TODO confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # intentionally empty
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # passed as data_cfg=data_cfg to the train/val/test dataset entries
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],  # this and the next three values come from channel_cfg
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered list of transform configs; used as `pipeline` of the train dataset
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only entry that valid_pipeline does not contain
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # three-value mean/std; these match the commonly used ImageNet statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same entries as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # identical values to train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_train.json',  # split-4 train annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting, same in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_val.json',  # split-4 val annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split4_test.json',  # split-4 test annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets this list
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to `visualizer` below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file

configs/1shot-swin/graph_split5_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # no initial checkpoint path is set
+resume_from = None  # no resume checkpoint path is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # checkpoint cadence; unit presumably epochs - confirm
+evaluation = dict(
+    interval=25,  # evaluation cadence; note it differs from the checkpoint interval (20)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # learning rate passed to the Adam optimizer
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# Learning-rate policy: step schedule with a linear warmup.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # step milestones; both are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,  # logging cadence; unit presumably iterations - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # channel bookkeeping; read later by data_cfg and by the entries of `data`
+    num_output_channels=1,
+    dataset_joints=1,  # copied into data_cfg as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded as max_kpt_num to the train/val/test dataset entries
+# Model settings: a SwinTransformerV2 encoder config plus a PoseHead keypoint head.
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path to pretrained weights
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],  # four entries, paired with the four entries of num_heads
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # equals embed_dim * 8; presumably the encoder's final feature width - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # graph-decoder option; the meaning of 'pre' is not visible here
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # presumably -1 disables this option - TODO confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # intentionally empty
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # passed as data_cfg=data_cfg to the train/val/test dataset entries
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],  # this and the next three values come from channel_cfg
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered list of transform configs; used as `pipeline` of the train dataset
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only entry that valid_pipeline does not contain
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # three-value mean/std; these match the commonly used ImageNet statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same entries as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # identical values to train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_train.json',  # split-5 train annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting, same in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_val.json',  # split-5 val annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split5_test.json',  # split-5 test annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets this list
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to `visualizer` below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file

configs/1shots/base_split1_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # no initial checkpoint path is set
+resume_from = None  # no resume checkpoint path is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # checkpoint cadence; unit presumably epochs - confirm
+evaluation = dict(
+    interval=25,  # evaluation cadence; note it differs from the checkpoint interval (20)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # learning rate passed to the Adam optimizer
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# Learning-rate policy: step schedule with a linear warmup.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # step milestones; both are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,  # logging cadence; unit presumably iterations - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # channel bookkeeping; read later by data_cfg and by the entries of `data`
+    num_output_channels=1,
+    dataset_joints=1,  # copied into data_cfg as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded as max_kpt_num to the train/val/test dataset entries
+# Model settings: a SwinTransformerV2 encoder config plus a PoseHead keypoint head.
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path to pretrained weights
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],  # four entries, paired with the four entries of num_heads
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8; presumably the encoder's final feature width - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # presumably -1 disables this option - TODO confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # intentionally empty
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # passed as data_cfg=data_cfg to the train/val/test dataset entries
+    image_size=[256, 256],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],  # this and the next three values come from channel_cfg
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered list of transform configs; used as `pipeline` of the train dataset
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only entry that valid_pipeline does not contain
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # three-value mean/std; these match the commonly used ImageNet statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same entries as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # identical values to train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the f-strings below
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',  # split-1 train annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting, same in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',  # split-1 val annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',  # split-1 test annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets this list
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to `visualizer` below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file

configs/1shots/base_split2_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # logging verbosity
+load_from = None  # no initial checkpoint path is set
+resume_from = None  # no resume checkpoint path is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # checkpoint cadence; unit presumably epochs - confirm
+evaluation = dict(
+    interval=25,  # evaluation cadence; note it differs from the checkpoint interval (20)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # learning rate passed to the Adam optimizer
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# Learning-rate policy: step schedule with a linear warmup.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # step milestones; both are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,  # logging cadence; unit presumably iterations - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # channel bookkeeping; read later by data_cfg and by the entries of `data`
+    num_output_channels=1,
+    dataset_joints=1,  # copied into data_cfg as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded as max_kpt_num to the train/val/test dataset entries
+# Model settings: a SwinTransformerV2 encoder config plus a PoseHead keypoint head.
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path to pretrained weights
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],  # four entries, paired with the four entries of num_heads
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8; presumably the encoder's final feature width - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # presumably -1 disables this option - TODO confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # intentionally empty
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # passed as data_cfg=data_cfg to the train/val/test dataset entries
+    image_size=[256, 256],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],  # this and the next three values come from channel_cfg
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # ordered list of transform configs; used as `pipeline` of the train dataset
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only entry that valid_pipeline does not contain
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # three-value mean/std; these match the commonly used ImageNet statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same entries as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # identical values to train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative dataset root interpolated into the f-strings below
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_train.json',  # split-2 train annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting, same in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_val.json',  # split-2 val annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split2_test.json',  # split-2 test annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets this list
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to `visualizer` below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file

configs/1shots/base_split3_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # runtime/schedule settings start here; logging level name
+load_from = None  # no checkpoint path to load from is set
+resume_from = None  # no checkpoint path to resume from is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # single ('train', 1) entry; no other phase is listed
+checkpoint_config = dict(interval=20)  # checkpoint interval (presumably epochs -- confirm)
+evaluation = dict(
+    interval=25,  # evaluation interval (presumably epochs -- confirm)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])  # two logger hooks: text and TensorBoard
+channel_cfg = dict(  # keypoint channel settings; read by data_cfg and the dataset entries in this file
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to each dataset split as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder config + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],  # same length as depths (4 entries)
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same value as the data_cfg image_size sides
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8; presumably the encoder output width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): negative value presumably disables this option -- confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap geometry plus channel settings copied from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size on each side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only step that valid_pipeline does not have
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # values match the commonly used ImageNet RGB mean/std
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # validation transform list: train_pipeline without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # same normalization values as train_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the test split reuses the same list object
+data_root = 'data/mp100'  # relative dataset root; all paths below are built from it
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_train.json',  # split 3 train annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # same dataset type as train
+        ann_file=f'{data_root}/annotations/mp100_split3_val.json',  # split 3 val annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split3_test.json',  # split 3 test annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),  # test_pipeline is an alias of valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # uses both backends above
+shuffle_cfg = dict(interval=1)

configs/1shots/base_split4_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # runtime/schedule settings start here; logging level name
+load_from = None  # no checkpoint path to load from is set
+resume_from = None  # no checkpoint path to resume from is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # single ('train', 1) entry; no other phase is listed
+checkpoint_config = dict(interval=20)  # checkpoint interval (presumably epochs -- confirm)
+evaluation = dict(
+    interval=25,  # evaluation interval (presumably epochs -- confirm)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])  # two logger hooks: text and TensorBoard
+channel_cfg = dict(  # keypoint channel settings; read by data_cfg and the dataset entries in this file
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to each dataset split as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder config + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],  # same length as depths (4 entries)
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same value as the data_cfg image_size sides
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8; presumably the encoder output width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): negative value presumably disables this option -- confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap geometry plus channel settings copied from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size on each side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only step that valid_pipeline does not have
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # values match the commonly used ImageNet RGB mean/std
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # validation transform list: train_pipeline without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # same normalization values as train_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the test split reuses the same list object
+data_root = 'data/mp100'  # relative dataset root; all paths below are built from it
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_train.json',  # split 4 train annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # same dataset type as train
+        ann_file=f'{data_root}/annotations/mp100_split4_val.json',  # split 4 val annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split4_test.json',  # split 4 test annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),  # test_pipeline is an alias of valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # uses both backends above
+shuffle_cfg = dict(interval=1)

configs/1shots/base_split5_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # runtime/schedule settings start here; logging level name
+load_from = None  # no checkpoint path to load from is set
+resume_from = None  # no checkpoint path to resume from is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # single ('train', 1) entry; no other phase is listed
+checkpoint_config = dict(interval=20)  # checkpoint interval (presumably epochs -- confirm)
+evaluation = dict(
+    interval=25,  # evaluation interval (presumably epochs -- confirm)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])  # two logger hooks: text and TensorBoard
+channel_cfg = dict(  # keypoint channel settings; read by data_cfg and the dataset entries in this file
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to each dataset split as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder config + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],  # same length as depths (4 entries)
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same value as the data_cfg image_size sides
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8; presumably the encoder output width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): negative value presumably disables this option -- confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap geometry plus channel settings copied from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size on each side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only step that valid_pipeline does not have
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # values match the commonly used ImageNet RGB mean/std
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # validation transform list: train_pipeline without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # same normalization values as train_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the test split reuses the same list object
+data_root = 'data/mp100'  # relative dataset root; all paths below are built from it
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_train.json',  # split 5 train annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # same dataset type as train
+        ann_file=f'{data_root}/annotations/mp100_split5_val.json',  # split 5 val annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split5_test.json',  # split 5 test annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),  # test_pipeline is an alias of valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # uses both backends above
+shuffle_cfg = dict(interval=1)

configs/1shots/graph_split1_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # runtime/schedule settings start here; logging level name
+load_from = None  # no checkpoint path to load from is set
+resume_from = None  # no checkpoint path to resume from is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # single ('train', 1) entry; no other phase is listed
+checkpoint_config = dict(interval=20)  # checkpoint interval (presumably epochs -- confirm)
+evaluation = dict(
+    interval=25,  # evaluation interval (presumably epochs -- confirm)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])  # two logger hooks: text and TensorBoard
+channel_cfg = dict(  # keypoint channel settings; read by data_cfg and the dataset entries in this file
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to each dataset split as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder config + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],  # same length as depths (4 entries)
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same value as the data_cfg image_size sides
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8; presumably the encoder output width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # key absent from the transformer dict in base_split3/4/5_config.py
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): negative value presumably disables this option -- confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap geometry plus channel settings copied from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size on each side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only step that valid_pipeline does not have
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # values match the commonly used ImageNet RGB mean/std
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # validation transform list: train_pipeline without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # same normalization values as train_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the test split reuses the same list object
+data_root = 'data/mp100'  # relative dataset root; all paths below are built from it
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',  # split 1 train annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # same dataset type as train
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',  # split 1 val annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',  # split 1 test annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),  # test_pipeline is an alias of valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # uses both backends above
+shuffle_cfg = dict(interval=1)

configs/1shots/graph_split2_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # runtime/schedule settings start here; logging level name
+load_from = None  # no checkpoint path to load from is set
+resume_from = None  # no checkpoint path to resume from is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # single ('train', 1) entry; no other phase is listed
+checkpoint_config = dict(interval=20)  # checkpoint interval (presumably epochs -- confirm)
+evaluation = dict(
+    interval=25,  # evaluation interval (presumably epochs -- confirm)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])  # two logger hooks: text and TensorBoard
+channel_cfg = dict(  # keypoint channel settings; read by data_cfg and the dataset entries in this file
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to each dataset split as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder config + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],  # same length as depths (4 entries)
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same value as the data_cfg image_size sides
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8; presumably the encoder output width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # key absent from the transformer dict in base_split3/4/5_config.py
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): negative value presumably disables this option -- confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap geometry plus channel settings copied from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size on each side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only step that valid_pipeline does not have
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # values match the commonly used ImageNet RGB mean/std
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # validation transform list: train_pipeline without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # same normalization values as train_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the test split reuses the same list object
+data_root = 'data/mp100'  # relative dataset root; all paths below are built from it
+data = dict(
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_train.json',  # split 2 train annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # 1-shot setting
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # same dataset type as train
+        ann_file=f'{data_root}/annotations/mp100_split2_val.json',  # split 2 val annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split2_test.json',  # split 2 test annotations
+        img_prefix=f'{data_root}/images/',
+        # alternative (disabled): img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),  # test_pipeline is an alias of valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # uses both backends above
+shuffle_cfg = dict(interval=1)

configs/1shots/graph_split3_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # runtime/schedule settings start here; logging level name
+load_from = None  # no checkpoint path to load from is set
+resume_from = None  # no checkpoint path to resume from is set
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # single ('train', 1) entry; no other phase is listed
+checkpoint_config = dict(interval=20)  # checkpoint interval (presumably epochs -- confirm)
+evaluation = dict(
+    interval=25,  # evaluation interval (presumably epochs -- confirm)
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy: step schedule with linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])  # two logger hooks: text and TensorBoard
+channel_cfg = dict(  # keypoint channel settings; read by data_cfg and the dataset entries in this file
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to each dataset split as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder config + PoseHead keypoint head
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path to the pretrained weights file
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],  # same length as depths (4 entries)
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same value as the data_cfg image_size sides
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8; presumably the encoder output width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # key absent from the transformer dict in base_split3/4/5_config.py
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): negative value presumably disables this option -- confirm
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap geometry plus channel settings copied from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size on each side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),  # the only step that valid_pipeline does not have
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # values match the commonly used ImageNet RGB mean/std
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # validation transform list: train_pipeline without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),  # same normalization values as train_pipeline
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the test split reuses the same list object
+data_root = 'data/mp100'  # relative dataset root; every ann_file and img_prefix below is built from it
+data = dict(  # Loader settings plus the train / val / test dataset entries.
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_train.json',  # -> data/mp100/annotations/mp100_split3_train.json
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # the same num_shots value is repeated in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # val uses the same dataset type as train; test uses TestPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split3_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry below asks for 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # set only on the test entry
+        pipeline=test_pipeline),  # test_pipeline is the very same list as valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list defined above
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm what interval counts

configs/1shots/graph_split4_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # Runtime, optimizer and schedule settings start here; this one is the log verbosity.
+load_from = None  # no checkpoint is preloaded by default
+resume_from = None  # no earlier run is resumed by default
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase is listed
+checkpoint_config = dict(interval=20)  # NOTE(review): interval presumably counts epochs - confirm against the runner
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from the checkpoint interval (20); the two only coincide every 100 - confirm intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above; presumably selects the best checkpoint - TODO confirm
+    gpu_collect=True,
+    res_folder='')  # empty string; how the evaluator interprets it is not visible here
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # base learning rate; no weight decay is configured in this dict
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200); presumably epoch indices - TODO confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,  # NOTE(review): presumably iterations between log records - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # Channel bookkeeping; data_cfg and the dataset entries later in this file read from it.
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(  # PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path; the weights file is not part of this config
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8 (96 * 8); presumably the encoder's last-stage width - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # NOTE(review): accepted values are defined by EncoderDecoder, which is not visible here
+            dim_feedforward=768,  # same number as in_channels above
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers; note the singular key name
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options are set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # Input/target geometry plus channel settings copied out of channel_cfg.
+    image_size=[256, 256],  # same side length as encoder_config img_size (256)
+    heatmap_size=[64, 64],  # one quarter of image_size on each side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # transform specs for training; each dict names its transform through `type`
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # the one step that valid_pipeline does not contain
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # mean/std equal the widely used ImageNet RGB channel statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object as valid_pipeline, not a copy
+data_root = 'data/mp100'  # relative dataset root; every ann_file and img_prefix below is built from it
+data = dict(  # Loader settings plus the train / val / test dataset entries.
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_train.json',  # -> data/mp100/annotations/mp100_split4_train.json
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # the same num_shots value is repeated in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # val uses the same dataset type as train; test uses TestPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split4_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry below asks for 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # set only on the test entry
+        pipeline=test_pipeline),  # test_pipeline is the very same list as valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list defined above
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm what interval counts

configs/1shots/graph_split5_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # Runtime, optimizer and schedule settings start here; this one is the log verbosity.
+load_from = None  # no checkpoint is preloaded by default
+resume_from = None  # no earlier run is resumed by default
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase is listed
+checkpoint_config = dict(interval=20)  # NOTE(review): interval presumably counts epochs - confirm against the runner
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from the checkpoint interval (20); the two only coincide every 100 - confirm intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above; presumably selects the best checkpoint - TODO confirm
+    gpu_collect=True,
+    res_folder='')  # empty string; how the evaluator interprets it is not visible here
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # base learning rate; no weight decay is configured in this dict
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200); presumably epoch indices - TODO confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,  # NOTE(review): presumably iterations between log records - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # Channel bookkeeping; data_cfg and the dataset entries later in this file read from it.
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(  # PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path; the weights file is not part of this config
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # equals embed_dim * 8 (96 * 8); presumably the encoder's last-stage width - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # NOTE(review): accepted values are defined by EncoderDecoder, which is not visible here
+            dim_feedforward=768,  # same number as in_channels above
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers; note the singular key name
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options are set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # Input/target geometry plus channel settings copied out of channel_cfg.
+    image_size=[256, 256],  # same side length as encoder_config img_size (256)
+    heatmap_size=[64, 64],  # one quarter of image_size on each side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # transform specs for training; each dict names its transform through `type`
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # the one step that valid_pipeline does not contain
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # mean/std equal the widely used ImageNet RGB channel statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object as valid_pipeline, not a copy
+data_root = 'data/mp100'  # relative dataset root; every ann_file and img_prefix below is built from it
+data = dict(  # Loader settings plus the train / val / test dataset entries.
+    samples_per_gpu=16,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_train.json',  # -> data/mp100/annotations/mp100_split5_train.json
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # the same num_shots value is repeated in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # val uses the same dataset type as train; test uses TestPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split5_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry below asks for 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # set only on the test entry
+        pipeline=test_pipeline),  # test_pipeline is the very same list as valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list defined above
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm what interval counts

configs/5shot-swin/base_split1_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # Runtime, optimizer and schedule settings start here; this one is the log verbosity.
+load_from = None  # no checkpoint is preloaded by default
+resume_from = None  # no earlier run is resumed by default
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase is listed
+checkpoint_config = dict(interval=20)  # NOTE(review): interval presumably counts epochs - confirm against the runner
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from the checkpoint interval (20); the two only coincide every 100 - confirm intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above; presumably selects the best checkpoint - TODO confirm
+    gpu_collect=True,
+    res_folder='')  # empty string; how the evaluator interprets it is not visible here
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # base learning rate; no weight decay is configured in this dict
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200); presumably epoch indices - TODO confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,  # NOTE(review): presumably iterations between log records - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # Channel bookkeeping; data_cfg and the dataset entries later in this file read from it.
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(  # PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path; the weights file is not part of this config
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],  # one entry per stage (four, like depths and num_heads)
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # equals embed_dim * 8 (128 * 8); presumably the encoder's last-stage width - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,  # same number as in_channels above
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers; note the singular key name
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options are set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # Input/target geometry plus channel settings copied out of channel_cfg.
+    image_size=[224, 224],  # same side length as encoder_config img_size (224)
+    heatmap_size=[64, 64],  # NOTE(review): 224 / 64 = 3.5, so the heatmap is not an integer fraction of the input - confirm intended
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # transform specs for training; each dict names its transform through `type`
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # the one step that valid_pipeline does not contain
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # mean/std equal the widely used ImageNet RGB channel statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object as valid_pipeline, not a copy
+data_root = 'data/mp100'  # relative dataset root; every ann_file and img_prefix below is built from it
+data = dict(  # Loader settings plus the train / val / test dataset entries.
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',  # -> data/mp100/annotations/mp100_split1_train.json
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # the same num_shots value is repeated in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # val uses the same dataset type as train; test uses TestPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,  # the test entry below asks for 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # set only on the test entry
+        pipeline=test_pipeline),  # test_pipeline is the very same list as valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list defined above
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm what interval counts

configs/5shot-swin/base_split2_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # Runtime, optimizer and schedule settings start here; this one is the log verbosity.
+load_from = None  # no checkpoint is preloaded by default
+resume_from = None  # no earlier run is resumed by default
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase is listed
+checkpoint_config = dict(interval=20)  # NOTE(review): interval presumably counts epochs - confirm against the runner
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from the checkpoint interval (20); the two only coincide every 100 - confirm intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above; presumably selects the best checkpoint - TODO confirm
+    gpu_collect=True,
+    res_folder='')  # empty string; how the evaluator interprets it is not visible here
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # base learning rate; no weight decay is configured in this dict
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200); presumably epoch indices - TODO confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,  # NOTE(review): presumably iterations between log records - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # Channel bookkeeping; data_cfg and the dataset entries later in this file read from it.
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(  # PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path; the weights file is not part of this config
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],  # one entry per stage (four, like depths and num_heads)
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # equals embed_dim * 8 (128 * 8); presumably the encoder's last-stage width - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,  # same number as in_channels above
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers; note the singular key name
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options are set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # Input/target geometry plus channel settings copied out of channel_cfg.
+    image_size=[224, 224],  # same side length as encoder_config img_size (224)
+    heatmap_size=[64, 64],  # NOTE(review): 224 / 64 = 3.5, so the heatmap is not an integer fraction of the input - confirm intended
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # transform specs for training; each dict names its transform through `type`
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # the one step that valid_pipeline does not contain
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # mean/std equal the widely used ImageNet RGB channel statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object as valid_pipeline, not a copy
+data_root = 'data/mp100'  # relative dataset root; every ann_file and img_prefix below is built from it
+data = dict(  # Loader settings plus the train / val / test dataset entries.
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_train.json',  # -> data/mp100/annotations/mp100_split2_train.json
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # the same num_shots value is repeated in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # val uses the same dataset type as train; test uses TestPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,  # the test entry below asks for 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # set only on the test entry
+        pipeline=test_pipeline),  # test_pipeline is the very same list as valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list defined above
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm what interval counts

configs/5shot-swin/base_split3_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # Runtime, optimizer and schedule settings start here; this one is the log verbosity.
+load_from = None  # no checkpoint is preloaded by default
+resume_from = None  # no earlier run is resumed by default
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase is listed
+checkpoint_config = dict(interval=20)  # NOTE(review): interval presumably counts epochs - confirm against the runner
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from the checkpoint interval (20); the two only coincide every 100 - confirm intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above; presumably selects the best checkpoint - TODO confirm
+    gpu_collect=True,
+    res_folder='')  # empty string; how the evaluator interprets it is not visible here
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # base learning rate; no weight decay is configured in this dict
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200); presumably epoch indices - TODO confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,  # NOTE(review): presumably iterations between log records - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # Channel bookkeeping; data_cfg and the dataset entries later in this file read from it.
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(  # PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path; the weights file is not part of this config
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],  # one entry per stage (four, like depths and num_heads)
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # equals embed_dim * 8 (128 * 8); presumably the encoder's last-stage width - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,  # same number as in_channels above
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers; note the singular key name
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options are set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # Input/target geometry plus channel settings copied out of channel_cfg.
+    image_size=[224, 224],  # same side length as encoder_config img_size (224)
+    heatmap_size=[64, 64],  # NOTE(review): 224 / 64 = 3.5, so the heatmap is not an integer fraction of the input - confirm intended
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # transform specs for training; each dict names its transform through `type`
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # the one step that valid_pipeline does not contain
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # mean/std equal the widely used ImageNet RGB channel statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object as valid_pipeline, not a copy
+data_root = 'data/mp100'  # relative dataset root; every ann_file and img_prefix below is built from it
+data = dict(  # Loader settings plus the train / val / test dataset entries.
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_train.json',  # -> data/mp100/annotations/mp100_split3_train.json
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # the same num_shots value is repeated in val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',  # val uses the same dataset type as train; test uses TestPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split3_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,  # the test entry below asks for 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # set only on the test entry
+        pipeline=test_pipeline),  # test_pipeline is the very same list as valid_pipeline
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list defined above
+shuffle_cfg = dict(interval=1)  # NOTE(review): the consumer of shuffle_cfg is not visible in this file - confirm what interval counts

configs/5shot-swin/base_split4_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # Runtime, optimizer and schedule settings start here; this one is the log verbosity.
+load_from = None  # no checkpoint is preloaded by default
+resume_from = None  # no earlier run is resumed by default
+dist_params = dict(backend='nccl')  # distributed backend name
+workflow = [('train', 1)]  # a single ('train', 1) phase is listed
+checkpoint_config = dict(interval=20)  # NOTE(review): interval presumably counts epochs - confirm against the runner
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from the checkpoint interval (20); the two only coincide every 100 - confirm intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed above; presumably selects the best checkpoint - TODO confirm
+    gpu_collect=True,
+    res_folder='')  # empty string; how the evaluator interprets it is not visible here
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,  # base learning rate; no weight decay is configured in this dict
+)
+optimizer_config = dict(grad_clip=None)  # no gradient clipping configured
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones lie below total_epochs (200); presumably epoch indices - TODO confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,  # NOTE(review): presumably iterations between log records - confirm
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # Channel bookkeeping; data_cfg and the dataset entries later in this file read from it.
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to the dataset entries as max_kpt_num
+# model settings
+model = dict(  # PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path; the weights file is not part of this config
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],  # one entry per stage (four, like depths and num_heads)
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # equals embed_dim * 8 (128 * 8); presumably the encoder's last-stage width - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=1024,  # same number as in_channels above
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers; note the singular key name
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 looks like a "disabled" sentinel - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),  # empty: no training-specific options are set
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # Input/target geometry plus channel settings copied out of channel_cfg.
+    image_size=[224, 224],  # same side length as encoder_config img_size (224)
+    heatmap_size=[64, 64],  # NOTE(review): 224 / 64 = 3.5, so the heatmap is not an integer fraction of the input - confirm intended
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # transform specs for training; each dict names its transform through `type`
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # the one step that valid_pipeline does not contain
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # mean/std equal the widely used ImageNet RGB channel statistics
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object as valid_pipeline, not a copy
+data_root = 'data/mp100'
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

configs/5shot-swin/base_split5_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # Config: SwinTransformerV2 encoder + PoseHead, 5 shots, mp100 split 5 annotations.
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # NOTE(review): unit of interval (epochs vs. iterations) is not visible here - confirm
+evaluation = dict(
+    interval=25,
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the four metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # data_cfg copies four of these fields; max_kpt_num is passed to each dataset
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path - presumably resolved from the working directory; confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,  # larger than every entry of pretrained_window_sizes below
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # matches data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # presumably embed_dim * 2**3 (128 * 8) for the 4-stage encoder - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,  # equals 2 * positional_encoding num_feats (128)
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # shared by the train, val and test datasets below
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # the only step that valid_pipeline does not have
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): looks like the usual ImageNet RGB mean/std - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps and arguments as train_pipeline, minus the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object, not a copy
+data_root = 'data/mp100'  # prefix of every ann_file / img_prefix below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in train, val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train and val use TransformerPoseDataset instead
+        ann_file=f'{data_root}/annotations/mp100_split5_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # val uses 100
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test dataset sets this key
+        pipeline=test_pipeline),
+)
+vis_backends = [  # passed to visualizer below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

configs/5shot-swin/graph_split1_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # Config: SwinTransformerV2 encoder + PoseHead with graph_decoder='pre', 5 shots, mp100 split 1 annotations.
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # NOTE(review): unit of interval (epochs vs. iterations) is not visible here - confirm
+evaluation = dict(
+    interval=25,
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the four metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # data_cfg copies four of these fields; max_kpt_num is passed to each dataset
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path - presumably resolved from the working directory; confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,  # larger than every entry of pretrained_window_sizes below
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # matches data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # presumably embed_dim * 2**3 (128 * 8) for the 4-stage encoder - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,  # equals 2 * positional_encoding num_feats (128)
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',  # key absent from the base_split*_config.py transformer dicts in this diff
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # shared by the train, val and test datasets below
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # the only step that valid_pipeline does not have
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): looks like the usual ImageNet RGB mean/std - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps and arguments as train_pipeline, minus the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object, not a copy
+data_root = 'data/mp100'  # prefix of every ann_file / img_prefix below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in train, val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train and val use TransformerPoseDataset instead
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # val uses 100
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test dataset sets this key
+        pipeline=test_pipeline),
+)
+vis_backends = [  # passed to visualizer below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

configs/5shot-swin/graph_split2_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # Config: SwinTransformerV2 encoder + PoseHead with graph_decoder='pre', 5 shots, mp100 split 2 annotations.
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # NOTE(review): unit of interval (epochs vs. iterations) is not visible here - confirm
+evaluation = dict(
+    interval=25,
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the four metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # data_cfg copies four of these fields; max_kpt_num is passed to each dataset
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path - presumably resolved from the working directory; confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,  # larger than every entry of pretrained_window_sizes below
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # matches data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # presumably embed_dim * 2**3 (128 * 8) for the 4-stage encoder - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,  # equals 2 * positional_encoding num_feats (128)
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',  # key absent from the base_split*_config.py transformer dicts in this diff
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # shared by the train, val and test datasets below
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # the only step that valid_pipeline does not have
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): looks like the usual ImageNet RGB mean/std - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps and arguments as train_pipeline, minus the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object, not a copy
+data_root = 'data/mp100'  # prefix of every ann_file / img_prefix below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in train, val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train and val use TransformerPoseDataset instead
+        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # val uses 100
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test dataset sets this key
+        pipeline=test_pipeline),
+)
+vis_backends = [  # passed to visualizer below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

configs/5shot-swin/graph_split3_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # Config: SwinTransformerV2 encoder + PoseHead with graph_decoder='pre', 5 shots, mp100 split 3 annotations.
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # NOTE(review): unit of interval (epochs vs. iterations) is not visible here - confirm
+evaluation = dict(
+    interval=25,
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the four metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # data_cfg copies four of these fields; max_kpt_num is passed to each dataset
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path - presumably resolved from the working directory; confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,  # larger than every entry of pretrained_window_sizes below
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # matches data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # presumably embed_dim * 2**3 (128 * 8) for the 4-stage encoder - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,  # equals 2 * positional_encoding num_feats (128)
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',  # key absent from the base_split*_config.py transformer dicts in this diff
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # shared by the train, val and test datasets below
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # the only step that valid_pipeline does not have
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): looks like the usual ImageNet RGB mean/std - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps and arguments as train_pipeline, minus the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object, not a copy
+data_root = 'data/mp100'  # prefix of every ann_file / img_prefix below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in train, val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train and val use TransformerPoseDataset instead
+        ann_file=f'{data_root}/annotations/mp100_split3_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # val uses 100
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test dataset sets this key
+        pipeline=test_pipeline),
+)
+vis_backends = [  # passed to visualizer below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

configs/5shot-swin/graph_split4_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # Config: SwinTransformerV2 encoder + PoseHead with graph_decoder='pre', 5 shots, mp100 split 4 annotations.
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # NOTE(review): unit of interval (epochs vs. iterations) is not visible here - confirm
+evaluation = dict(
+    interval=25,
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the four metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones are below total_epochs (200)
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # data_cfg copies four of these fields; max_kpt_num is passed to each dataset
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',  # relative path - presumably resolved from the working directory; confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,  # larger than every entry of pretrained_window_sizes below
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # matches data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # presumably embed_dim * 2**3 (128 * 8) for the 4-stage encoder - TODO confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,  # equals 2 * positional_encoding num_feats (128)
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',  # key absent from the base_split*_config.py transformer dicts in this diff
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # shared by the train, val and test datasets below
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(  # the only step that valid_pipeline does not have
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): looks like the usual ImageNet RGB mean/std - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps and arguments as train_pipeline, minus the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: the same list object, not a copy
+data_root = 'data/mp100'  # prefix of every ann_file / img_prefix below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in train, val and test
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train and val use TransformerPoseDataset instead
+        ann_file=f'{data_root}/annotations/mp100_split4_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # val uses 100
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test dataset sets this key
+        pipeline=test_pipeline),
+)
+vis_backends = [  # passed to visualizer below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

configs/5shot-swin/graph_split5_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'  # runtime, evaluation, optimizer and LR-schedule settings follow
+load_from = None  # no checkpoint path set for loading
+resume_from = None  # no checkpoint path set for resuming
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from checkpoint_config interval (20) - confirm this is intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # picked from the metric list above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient-clipping settings supplied
+# learning policy: 'step' schedule with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # NOTE(review): presumably epoch milestones, both below total_epochs - confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # one output channel / one joint entry; values are re-read by data_cfg and the dataset entries
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead with an EncoderDecoder transformer
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_base_22k_500k.pth',
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        img_size=224,  # same side length as data_cfg image_size ([224, 224])
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=1024,  # NOTE(review): presumably embed_dim * 8 (128 * 8) from the last encoder stage - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',  # NOTE(review): graph-decoder mode; semantics are defined by EncoderDecoder - confirm there
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers above
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 presumably disables this option - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats (128) equals d_model / 2
+    # training and testing settings (train_cfg is left empty)
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap sizes plus the channel layout copied from channel_cfg
+    image_size=[224, 224],  # matches encoder_config img_size (224)
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list; differs from valid_pipeline only by the TopDownGetRandomScaleRotation step
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): these look like the usual ImageNet RGB statistics - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # evaluation transform list: the train_pipeline steps without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # same mean/std values as train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: test reuses the very same list object
+data_root = 'data/mp100'
+data = dict(  # train/val/test dataset entries for mp100 split5, num_shots=5 in each
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_train.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_val.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split5_test.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to the visualizer entry below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): consumer is not visible in this config - confirm where it is read

configs/5shots/base_split1_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # runtime, evaluation, optimizer and LR-schedule settings follow
+load_from = None  # no checkpoint path set for loading
+resume_from = None  # no checkpoint path set for resuming
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from checkpoint_config interval (20) - confirm this is intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # picked from the metric list above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient-clipping settings supplied
+# learning policy: 'step' schedule with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # NOTE(review): presumably epoch milestones, both below total_epochs - confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # one output channel / one joint entry; values are re-read by data_cfg and the dataset entries
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead with an EncoderDecoder transformer
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # NOTE(review): presumably embed_dim * 8 (96 * 8) from the last encoder stage - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers above
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 presumably disables this option - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats (128) equals d_model / 2
+    # training and testing settings (train_cfg is left empty)
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap sizes plus the channel layout copied from channel_cfg
+    image_size=[256, 256],  # matches encoder_config img_size (256)
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list; differs from valid_pipeline only by the TopDownGetRandomScaleRotation step
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): these look like the usual ImageNet RGB statistics - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # evaluation transform list: the train_pipeline steps without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # same mean/std values as train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: test reuses the very same list object
+data_root = 'data/mp100'
+data = dict(  # train/val/test dataset entries for mp100 split1, num_shots=5 in each
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to the visualizer entry below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): consumer is not visible in this config - confirm where it is read

configs/5shots/base_split2_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # runtime, evaluation, optimizer and LR-schedule settings follow
+load_from = None  # no checkpoint path set for loading
+resume_from = None  # no checkpoint path set for resuming
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from checkpoint_config interval (20) - confirm this is intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # picked from the metric list above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient-clipping settings supplied
+# learning policy: 'step' schedule with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # NOTE(review): presumably epoch milestones, both below total_epochs - confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # one output channel / one joint entry; values are re-read by data_cfg and the dataset entries
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead with an EncoderDecoder transformer
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # NOTE(review): presumably embed_dim * 8 (96 * 8) from the last encoder stage - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers above
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 presumably disables this option - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats (128) equals d_model / 2
+    # training and testing settings (train_cfg is left empty)
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap sizes plus the channel layout copied from channel_cfg
+    image_size=[256, 256],  # matches encoder_config img_size (256)
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list; differs from valid_pipeline only by the TopDownGetRandomScaleRotation step
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): these look like the usual ImageNet RGB statistics - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # evaluation transform list: the train_pipeline steps without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # same mean/std values as train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: test reuses the very same list object
+data_root = 'data/mp100'
+data = dict(  # train/val/test dataset entries for mp100 split2, num_shots=5 in each
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_train.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to the visualizer entry below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): consumer is not visible in this config - confirm where it is read

configs/5shots/base_split3_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # runtime, evaluation, optimizer and LR-schedule settings follow
+load_from = None  # no checkpoint path set for loading
+resume_from = None  # no checkpoint path set for resuming
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from checkpoint_config interval (20) - confirm this is intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # picked from the metric list above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient-clipping settings supplied
+# learning policy: 'step' schedule with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # NOTE(review): presumably epoch milestones, both below total_epochs - confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # one output channel / one joint entry; values are re-read by data_cfg and the dataset entries
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead with an EncoderDecoder transformer
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # NOTE(review): presumably embed_dim * 8 (96 * 8) from the last encoder stage - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers above
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 presumably disables this option - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats (128) equals d_model / 2
+    # training and testing settings (train_cfg is left empty)
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap sizes plus the channel layout copied from channel_cfg
+    image_size=[256, 256],  # matches encoder_config img_size (256)
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list; differs from valid_pipeline only by the TopDownGetRandomScaleRotation step
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): these look like the usual ImageNet RGB statistics - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # evaluation transform list: the train_pipeline steps without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # same mean/std values as train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: test reuses the very same list object
+data_root = 'data/mp100'
+data = dict(  # train/val/test dataset entries for mp100 split3, num_shots=5 in each
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_train.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_val.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split3_test.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to the visualizer entry below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): consumer is not visible in this config - confirm where it is read

configs/5shots/base_split4_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # runtime, evaluation, optimizer and LR-schedule settings follow
+load_from = None  # no checkpoint path set for loading
+resume_from = None  # no checkpoint path set for resuming
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from checkpoint_config interval (20) - confirm this is intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # picked from the metric list above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient-clipping settings supplied
+# learning policy: 'step' schedule with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # NOTE(review): presumably epoch milestones, both below total_epochs - confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # one output channel / one joint entry; values are re-read by data_cfg and the dataset entries
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead with an EncoderDecoder transformer
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # NOTE(review): presumably embed_dim * 8 (96 * 8) from the last encoder stage - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers above
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 presumably disables this option - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats (128) equals d_model / 2
+    # training and testing settings (train_cfg is left empty)
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap sizes plus the channel layout copied from channel_cfg
+    image_size=[256, 256],  # matches encoder_config img_size (256)
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list; differs from valid_pipeline only by the TopDownGetRandomScaleRotation step
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): these look like the usual ImageNet RGB statistics - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # evaluation transform list: the train_pipeline steps without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # same mean/std values as train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: test reuses the very same list object
+data_root = 'data/mp100'
+data = dict(  # train/val/test dataset entries for mp100 split4, num_shots=5 in each
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_train.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_val.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split4_test.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),
+)
+vis_backends = [  # backend list handed to the visualizer entry below
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)  # NOTE(review): consumer is not visible in this config - confirm where it is read

configs/5shots/base_split5_config.py ADDED Viewed

	@@ -0,0 +1,190 @@

+log_level = 'INFO'  # runtime, evaluation, optimizer and LR-schedule settings follow
+load_from = None  # no checkpoint path set for loading
+resume_from = None  # no checkpoint path set for resuming
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)
+evaluation = dict(
+    interval=25,  # NOTE(review): differs from checkpoint_config interval (20) - confirm this is intended
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # picked from the metric list above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # no gradient-clipping settings supplied
+# learning policy: 'step' schedule with a linear warmup
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # NOTE(review): presumably epoch milestones, both below total_epochs - confirm
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # one output channel / one joint entry; values are re-read by data_cfg and the dataset entries
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings: PoseAnythingModel = SwinTransformerV2 encoder + PoseHead with an EncoderDecoder transformer
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg image_size ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # NOTE(review): presumably embed_dim * 8 (96 * 8) from the last encoder stage - confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,  # same value as transformer num_decoder_layers above
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,  # NOTE(review): -1 presumably disables this option - confirm in PoseHead
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats (128) equals d_model / 2
+    # training and testing settings (train_cfg is left empty)
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # image/heatmap sizes plus the channel layout copied from channel_cfg
+    image_size=[256, 256],  # matches encoder_config img_size (256)
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [  # training transform list; differs from valid_pipeline only by the TopDownGetRandomScaleRotation step
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # NOTE(review): these look like the usual ImageNet RGB statistics - confirm
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # evaluation transform list: the train_pipeline steps without the random scale/rotation step
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # same mean/std values as train_pipeline
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: test reuses the very same list object
+data_root = 'data/mp100'
+data = dict(  # train/val/test dataset entries for mp100 split5, num_shots=5 in each
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_train.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_val.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # test uses a different dataset type than train/val
+        ann_file=f'{data_root}/annotations/mp100_split5_test.json',
+        img_prefix=f'{data_root}/images/',
+        # disabled alternative: img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,  # twice the val setting (100)
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

configs/5shots/graph_split1_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # unit is presumably epochs -- TODO confirm against the runner
+evaluation = dict(
+    interval=25,  # unit is presumably epochs -- TODO confirm
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # presumably means no gradient clipping -- confirm
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones fall before total_epochs (200) below
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # read by data_cfg and by the dataset entries in `data` further down
+    num_output_channels=1,
+    dataset_joints=1,  # data_cfg exposes this as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path; presumably resolved from the working directory -- confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],  # four entries, one per entry of num_heads
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg['image_size'] ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # 768 == 8 * embed_dim (96); presumably the encoder's final-stage width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # every field except the two sizes is read from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size per side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # only in train_pipeline; valid_pipeline omits this step
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # the widely used ImageNet RGB mean/std values
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative path, interpolated into the ann_file/img_prefix f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',  # split 1 annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in the val and test entries
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train/val use TransformerPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets PCK thresholds
+        pipeline=test_pipeline),
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list above
+shuffle_cfg = dict(interval=1)

configs/5shots/graph_split2_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # unit is presumably epochs -- TODO confirm against the runner
+evaluation = dict(
+    interval=25,  # unit is presumably epochs -- TODO confirm
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # presumably means no gradient clipping -- confirm
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones fall before total_epochs (200) below
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # read by data_cfg and by the dataset entries in `data` further down
+    num_output_channels=1,
+    dataset_joints=1,  # data_cfg exposes this as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path; presumably resolved from the working directory -- confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],  # four entries, one per entry of num_heads
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg['image_size'] ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # 768 == 8 * embed_dim (96); presumably the encoder's final-stage width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # every field except the two sizes is read from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size per side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # only in train_pipeline; valid_pipeline omits this step
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # the widely used ImageNet RGB mean/std values
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative path, interpolated into the ann_file/img_prefix f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_train.json',  # split 2 annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in the val and test entries
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train/val use TransformerPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets PCK thresholds
+        pipeline=test_pipeline),
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list above
+shuffle_cfg = dict(interval=1)

configs/5shots/graph_split3_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # unit is presumably epochs -- TODO confirm against the runner
+evaluation = dict(
+    interval=25,  # unit is presumably epochs -- TODO confirm
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # presumably means no gradient clipping -- confirm
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones fall before total_epochs (200) below
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # read by data_cfg and by the dataset entries in `data` further down
+    num_output_channels=1,
+    dataset_joints=1,  # data_cfg exposes this as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path; presumably resolved from the working directory -- confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],  # four entries, one per entry of num_heads
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg['image_size'] ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # 768 == 8 * embed_dim (96); presumably the encoder's final-stage width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # every field except the two sizes is read from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size per side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # only in train_pipeline; valid_pipeline omits this step
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # the widely used ImageNet RGB mean/std values
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative path, interpolated into the ann_file/img_prefix f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_train.json',  # split 3 annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in the val and test entries
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split3_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train/val use TransformerPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split3_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets PCK thresholds
+        pipeline=test_pipeline),
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list above
+shuffle_cfg = dict(interval=1)

configs/5shots/graph_split4_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # unit is presumably epochs -- TODO confirm against the runner
+evaluation = dict(
+    interval=25,  # unit is presumably epochs -- TODO confirm
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # presumably means no gradient clipping -- confirm
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones fall before total_epochs (200) below
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # read by data_cfg and by the dataset entries in `data` further down
+    num_output_channels=1,
+    dataset_joints=1,  # data_cfg exposes this as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path; presumably resolved from the working directory -- confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],  # four entries, one per entry of num_heads
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg['image_size'] ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # 768 == 8 * embed_dim (96); presumably the encoder's final-stage width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # every field except the two sizes is read from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size per side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # only in train_pipeline; valid_pipeline omits this step
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # the widely used ImageNet RGB mean/std values
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative path, interpolated into the ann_file/img_prefix f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_train.json',  # split 4 annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in the val and test entries
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split4_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train/val use TransformerPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split4_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets PCK thresholds
+        pipeline=test_pipeline),
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list above
+shuffle_cfg = dict(interval=1)

configs/5shots/graph_split5_config.py ADDED Viewed

	@@ -0,0 +1,191 @@

+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # unit is presumably epochs -- TODO confirm against the runner
+evaluation = dict(
+    interval=25,  # unit is presumably epochs -- TODO confirm
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # presumably means no gradient clipping -- confirm
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones fall before total_epochs (200) below
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # read by data_cfg and by the dataset entries in `data` further down
+    num_output_channels=1,
+    dataset_joints=1,  # data_cfg exposes this as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',  # relative path; presumably resolved from the working directory -- confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=96,
+        depths=[2, 2, 6, 2],  # four entries, one per entry of num_heads
+        num_heads=[3, 6, 12, 24],
+        window_size=16,
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg['image_size'] ([256, 256])
+        upsample="bilinear"
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        in_channels=768,  # 768 == 8 * embed_dim (96); presumably the encoder's final-stage width -- confirm
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            graph_decoder='pre',
+            dim_feedforward=768,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),  # num_feats is half of d_model (256)
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # every field except the two sizes is read from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size per side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # only in train_pipeline; valid_pipeline omits this step
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # the widely used ImageNet RGB mean/std values
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative path, interpolated into the ann_file/img_prefix f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_train.json',  # split 5 annotations
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,  # same value in the val and test entries
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split5_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train/val use TransformerPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split5_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=5,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets PCK thresholds
+        pipeline=test_pipeline),
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')  # reuses the vis_backends list above
+shuffle_cfg = dict(interval=1)

configs/demo.py ADDED Viewed

	@@ -0,0 +1,194 @@

+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+checkpoint_config = dict(interval=20)  # unit is presumably epochs -- TODO confirm against the runner
+evaluation = dict(
+    interval=25,  # unit is presumably epochs -- TODO confirm
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',  # one of the metrics listed on the line above
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+optimizer_config = dict(grad_clip=None)  # presumably means no gradient clipping -- confirm
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])  # both milestones fall before total_epochs (200) below
+total_epochs = 200
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+channel_cfg = dict(  # read by data_cfg and by the dataset entries in `data` further down
+    num_output_channels=1,
+    dataset_joints=1,  # data_cfg exposes this as num_joints
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)  # forwarded to every dataset entry as max_kpt_num
+# model settings
+model = dict(
+    type='TransformerPoseTwoStage',  # NOTE(review): the split configs in this commit use 'PoseAnythingModel' -- confirm this type is registered
+    pretrained='swinv2_large',  # a bare name, not a .pth path; presumably mapped to a checkpoint by the model code -- confirm
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=192,
+        depths=[2, 2, 18, 2],  # four entries, one per entry of num_heads and pretrained_window_sizes
+        num_heads=[6, 12, 24, 48],
+        window_size=16,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.2,
+        img_size=256,  # same side length as data_cfg['image_size'] ([256, 256])
+    ),
+    keypoint_head=dict(
+        type='TwoStageHead',  # NOTE(review): the split configs in this commit use 'PoseHead' -- confirm
+        in_channels=1536,  # 1536 == 8 * embed_dim (192); presumably the encoder's final-stage width -- confirm
+        transformer=dict(
+            type='TwoStageSupportRefineTransformer',
+            d_model=384,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,  # same value as num_decoder_layer on the head below
+            dim_feedforward=1536,
+            dropout=0.1,
+            similarity_proj_dim=384,
+            dynamic_proj_dim=192,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        support_pos_embed=False,
+        heatmap_loss_weight=2.0,
+        skeleton_loss_weight=0.02,
+        num_samples=0,
+        support_embedding_type="fixed",
+        num_support=100,  # NOTE(review): equals channel_cfg['max_kpt_num'] (100); confirm whether the two must match
+        support_order_dropout=-1,
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=192, normalize=True)),  # num_feats is half of d_model (384)
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+data_cfg = dict(  # every field except the two sizes is read from channel_cfg
+    image_size=[256, 256],
+    heatmap_size=[64, 64],  # one quarter of image_size per side
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,  # only in train_pipeline; valid_pipeline omits this step
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],  # the widely used ImageNet RGB mean/std values
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+valid_pipeline = [  # same steps as train_pipeline minus TopDownGetRandomScaleRotation
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[  # same ten keys as in train_pipeline, only wrapped differently
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+test_pipeline = valid_pipeline  # alias: both names refer to the same list object
+data_root = 'data/mp100'  # relative path, interpolated into the ann_file/img_prefix f-strings below
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_all.json',  # train reads mp100_all.json; val/test below read split1 files
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,  # same value in the val and test entries
+        pipeline=train_pipeline),
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,  # the test entry uses 200
+        pipeline=valid_pipeline),
+    test=dict(
+        type='TestPoseDataset',  # train/val use TransformerPoseDataset
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],  # only the test entry sets PCK thresholds
+        pipeline=test_pipeline),
+)
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

configs/demo_b.py ADDED Viewed

	@@ -0,0 +1,191 @@

+# Runtime settings: log verbosity, and no checkpoint to initialise or resume from.
+log_level = 'INFO'
+load_from = None
+resume_from = None
+dist_params = dict(backend='nccl')
+workflow = [('train', 1)]
+# Checkpoint every 20 intervals (presumably epochs -- confirm against the runner).
+checkpoint_config = dict(interval=20)
+# Evaluate every 25 intervals on four metrics; PCK is the key indicator.
+evaluation = dict(
+    interval=25,
+    metric=['PCK', 'NME', 'AUC', 'EPE'],
+    key_indicator='PCK',
+    gpu_collect=True,
+    res_folder='')
+optimizer = dict(
+    type='Adam',
+    lr=1e-5,
+)
+# No gradient clipping.
+optimizer_config = dict(grad_clip=None)
+# learning policy
+# Step schedule with a linear warmup over the first 1000 iterations,
+# starting from 0.001 of the base learning rate; steps at 160 and 180.
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=0.001,
+    step=[160, 180])
+total_epochs = 200
+# Log every 50 intervals to both the text log and TensorBoard.
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook')
+    ])
+# Channel layout: one output channel / one dataset joint. max_kpt_num is
+# forwarded to every dataset entry in `data` as its max_kpt_num.
+channel_cfg = dict(
+    num_output_channels=1,
+    dataset_joints=1,
+    dataset_channel=[
+        [
+            0,
+        ],
+    ],
+    inference_channel=[
+        0,
+    ],
+    max_kpt_num=100)
+# model settings
+# PoseAnythingModel = SwinTransformerV2 encoder + PoseHead keypoint head.
+model = dict(
+    type='PoseAnythingModel',
+    pretrained='swinv2_base',
+    encoder_config=dict(
+        type='SwinTransformerV2',
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=14,
+        pretrained_window_sizes=[12, 12, 12, 6],
+        drop_path_rate=0.1,
+        # Same side length as data_cfg['image_size'] (224) further down in this file.
+        img_size=224,
+    ),
+    keypoint_head=dict(
+        type='PoseHead',
+        # NOTE(review): 1024 == embed_dim (128) * 8, presumably the channel
+        # count of the encoder's last stage -- confirm against the encoder.
+        in_channels=1024,
+        transformer=dict(
+            type='EncoderDecoder',
+            d_model=256,
+            nhead=8,
+            num_encoder_layers=3,
+            num_decoder_layers=3,
+            graph_decoder='pre',
+            dim_feedforward=1024,
+            dropout=0.1,
+            similarity_proj_dim=256,
+            dynamic_proj_dim=128,
+            activation="relu",
+            normalize_before=False,
+            return_intermediate_dec=True),
+        share_kpt_branch=False,
+        # Same value as transformer.num_decoder_layers above.
+        num_decoder_layer=3,
+        with_heatmap_loss=True,
+        heatmap_loss_weight=2.0,
+        support_order_dropout=-1,
+        # num_feats=128 is half of the transformer's d_model (256).
+        positional_encoding=dict(
+            type='SinePositionalEncoding', num_feats=128, normalize=True)),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(
+        flip_test=False,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+# Input/heatmap geometry plus the channel layout copied from channel_cfg.
+data_cfg = dict(
+    image_size=[224, 224],
+    heatmap_size=[64, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'])
+# Training pipeline: load, random scale/rotation augmentation, affine warp,
+# tensor conversion + normalisation, target generation, then collection.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=15,
+        scale_factor=0.15),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
+        ]),
+]
+# Validation pipeline: identical steps to train_pipeline except that the
+# random scale/rotation augmentation is omitted.
+valid_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownAffineFewShot'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTargetFewShot', sigma=1),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs', 'category_id',
+            'skeleton',
+        ]),
+]
+# Testing reuses the validation pipeline (same list object, not a copy).
+test_pipeline = valid_pipeline
+# Dataset root; annotation and image paths below are built relative to it.
+data_root = 'data/mp100'
+# Dataset definitions. All three entries are 1-shot and share data_cfg and
+# max_kpt_num; they differ in annotation file, dataset type and pipeline.
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+    # Training set: split-1 train annotations with the augmented pipeline.
+    train=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_train.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        pipeline=train_pipeline),
+    # Validation set: 100 episodes with 15 queries each.
+    val=dict(
+        type='TransformerPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=100,
+        pipeline=valid_pipeline),
+    # Test set: a different dataset type (TestPoseDataset), 200 episodes with
+    # 15 queries each, and PCK evaluated at five thresholds.
+    test=dict(
+        type='TestPoseDataset',
+        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
+        img_prefix=f'{data_root}/images/',
+        # img_prefix=f'{data_root}',
+        data_cfg=data_cfg,
+        valid_class_ids=None,
+        max_kpt_num=channel_cfg['max_kpt_num'],
+        num_shots=1,
+        num_queries=15,
+        num_episodes=200,
+        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
+        pipeline=test_pipeline),
+)
+# Visualisation backends: local files and TensorBoard.
+vis_backends = [
+    dict(type='LocalVisBackend'),
+    dict(type='TensorboardVisBackend'),
+]
+visualizer = dict(
+    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+shuffle_cfg = dict(interval=1)

demo.py ADDED Viewed

	@@ -0,0 +1,289 @@

+import argparse
+import copy
+import os
+import pickle
+import random
+import cv2
+import numpy as np
+import torch
+from mmcv import Config, DictAction
+from mmcv.cnn import fuse_conv_bn
+from mmcv.runner import load_checkpoint
+from mmpose.core import wrap_fp16_model
+from mmpose.models import build_posenet
+from torchvision import transforms
+from models import *
+import torchvision.transforms.functional as F
+from tools.visualization import plot_results
+COLORS = [
+    [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
+    [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
+    [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
+    [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]]
+class Resize_Pad:
+    def __init__(self, w=256, h=256):
+        self.w = w
+        self.h = h
+    def __call__(self, image):
+        _, w_1, h_1 = image.shape
+        ratio_1 = w_1 / h_1
+        # check if the original and final aspect ratios are the same within a margin
+        if round(ratio_1, 2) != 1:
+            # padding to preserve aspect ratio
+            if ratio_1 > 1:  # Make the image higher
+                hp = int(w_1 - h_1)
+                hp = hp // 2
+                image = F.pad(image, (hp, 0, hp, 0), 0, "constant")
+                return F.resize(image, [self.h, self.w])
+            else:
+                wp = int(h_1 - w_1)
+                wp = wp // 2
+                image = F.pad(image, (0, wp, 0, wp), 0, "constant")
+                return F.resize(image, [self.h, self.w])
+        else:
+            return F.resize(image, [self.h, self.w])
+def transform_keypoints_to_pad_and_resize(keypoints, image_size):
+    trans_keypoints = keypoints.clone()
+    h, w = image_size[:2]
+    ratio_1 = w / h
+    if ratio_1 > 1:
+        # width is bigger than height - pad height
+        hp = int(w - h)
+        hp = hp // 2
+        trans_keypoints[:, 1] = keypoints[:, 1] + hp
+        trans_keypoints *= (256. / w)
+    else:
+        # height is bigger than width - pad width
+        wp = int(image_size[1] - image_size[0])
+        wp = wp // 2
+        trans_keypoints[:, 0] = keypoints[:, 0] + wp
+        trans_keypoints *= (256. / h)
+    return trans_keypoints
+def parse_args():
+    parser = argparse.ArgumentParser(description='Pose Anything Demo')
+    parser.add_argument('--support', help='Image file')
+    parser.add_argument('--query', help='Image file')
+    parser.add_argument('--config', default=None, help='test config file path')
+    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
+    parser.add_argument('--outdir', default='output', help='checkpoint file')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase'
+             'the inference speed')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        default={},
+        help='override some settings in the used config, the key-value pair '
+             'in xxx=yyy format will be merged into config file. For example, '
+             "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
+    args = parser.parse_args()
+    return args
+def merge_configs(cfg1, cfg2):
+    # Merge cfg2 into cfg1
+    # Overwrite cfg1 if repeated, ignore if value is None.
+    cfg1 = {} if cfg1 is None else cfg1.copy()
+    cfg2 = {} if cfg2 is None else cfg2
+    for k, v in cfg2.items():
+        if v:
+            cfg1[k] = v
+    return cfg1
+def main():
+    random.seed(0)
+    np.random.seed(0)
+    torch.manual_seed(0)
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    cfg.data.test.test_mode = True
+    os.makedirs(args.outdir, exist_ok=True)
+    # Load data
+    support_img = cv2.imread(args.support)
+    query_img = cv2.imread(args.query)
+    if support_img is None or query_img is None:
+        raise ValueError('Fail to read images')
+    preprocess = transforms.Compose([
+        transforms.ToTensor(),
+        Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)])
+    # frame = copy.deepcopy(support_img)
+    padded_support_img = preprocess(support_img).cpu().numpy().transpose(1, 2, 0) * 255
+    frame = copy.deepcopy(padded_support_img.astype(np.uint8).copy())
+    kp_src = []
+    skeleton = []
+    count = 0
+    prev_pt = None
+    prev_pt_idx = None
+    color_idx = 0
+    def selectKP(event, x, y, flags, param):
+        nonlocal kp_src, frame
+        # if we are in points selection mode, the mouse was clicked,
+        # list of  points with the (x, y) location of the click
+        # and draw the circle
+        if event == cv2.EVENT_LBUTTONDOWN:
+            kp_src.append((x, y))
+            cv2.circle(frame, (x, y), 2, (0, 0, 255), 1)
+            cv2.imshow("Source", frame)
+        if event == cv2.EVENT_RBUTTONDOWN:
+            kp_src = []
+            frame = copy.deepcopy(support_img)
+            cv2.imshow("Source", frame)
+    def draw_line(event, x, y, flags, param):
+        nonlocal skeleton, kp_src, frame, count, prev_pt, prev_pt_idx, marked_frame, color_idx
+        if event == cv2.EVENT_LBUTTONDOWN:
+            closest_point = min(kp_src, key=lambda p: (p[0] - x) ** 2 + (p[1] - y) ** 2)
+            closest_point_index = kp_src.index(closest_point)
+            if color_idx < len(COLORS):
+                c = COLORS[color_idx]
+            else:
+                c = random.choices(range(256), k=3)
+            color = color_idx
+            cv2.circle(frame, closest_point, 2, c, 1)
+            if count == 0:
+                prev_pt = closest_point
+                prev_pt_idx = closest_point_index
+                count = count + 1
+                cv2.imshow("Source", frame)
+            else:
+                cv2.line(frame, prev_pt, closest_point, c, 2)
+                cv2.imshow("Source", frame)
+                count = 0
+                skeleton.append((prev_pt_idx, closest_point_index))
+                color_idx = color_idx + 1
+        elif event == cv2.EVENT_RBUTTONDOWN:
+            frame = copy.deepcopy(marked_frame)
+            cv2.imshow("Source", frame)
+            count = 0
+            color_idx = 0
+            skeleton = []
+            prev_pt = None
+    cv2.namedWindow("Source", cv2.WINDOW_NORMAL)
+    cv2.resizeWindow('Source', 800, 600)
+    cv2.setMouseCallback("Source", selectKP)
+    cv2.imshow("Source", frame)
+    # keep looping until points have been selected
+    print('Press any key when finished marking the points!! ')
+    while True:
+        if cv2.waitKey(1) > 0:
+            break
+    marked_frame = copy.deepcopy(frame)
+    cv2.setMouseCallback("Source", draw_line)
+    print('Press any key when finished creating skeleton!!')
+    while True:
+        if cv2.waitKey(1) > 0:
+            break
+    cv2.destroyAllWindows()
+    kp_src = torch.tensor(kp_src).float()
+    preprocess = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)])
+    if len(skeleton) == 0:
+        skeleton = [(0, 0)]
+    support_img = preprocess(support_img).flip(0)[None]
+    query_img = preprocess(query_img).flip(0)[None]
+    # Create heatmap from keypoints
+    genHeatMap = TopDownGenerateTargetFewShot()
+    data_cfg = cfg.data_cfg
+    data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size])
+    data_cfg['joint_weights'] = None
+    data_cfg['use_different_joint_weights'] = False
+    kp_src_3d = torch.concatenate((kp_src, torch.zeros(kp_src.shape[0], 1)), dim=-1)
+    kp_src_3d_weight = torch.concatenate((torch.ones_like(kp_src), torch.zeros(kp_src.shape[0], 1)), dim=-1)
+    target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg, kp_src_3d, kp_src_3d_weight, sigma=1)
+    target_s = torch.tensor(target_s).float()[None]
+    target_weight_s = torch.tensor(target_weight_s).float()[None]
+    data = {
+        'img_s': [support_img],
+        'img_q': query_img,
+        'target_s': [target_s],
+        'target_weight_s': [target_weight_s],
+        'target_q': None,
+        'target_weight_q': None,
+        'return_loss': False,
+        'img_metas': [{'sample_skeleton': [skeleton],
+                       'query_skeleton': skeleton,
+                       'sample_joints_3d': [kp_src_3d],
+                       'query_joints_3d': kp_src_3d,
+                       'sample_center': [kp_src.mean(dim=0)],
+                       'query_center': kp_src.mean(dim=0),
+                       'sample_scale': [kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0]],
+                       'query_scale': kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0],
+                       'sample_rotation': [0],
+                       'query_rotation': 0,
+                       'sample_bbox_score': [1],
+                       'query_bbox_score': 1,
+                       'query_image_file': '',
+                       'sample_image_file': [''],
+                       }]
+    }
+    # Load model
+    model = build_posenet(cfg.model)
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    load_checkpoint(model, args.checkpoint, map_location='cpu')
+    if args.fuse_conv_bn:
+        model = fuse_conv_bn(model)
+    model.eval()
+    with torch.no_grad():
+        outputs = model(**data)
+    # visualize results
+    vis_s_weight = target_weight_s[0]
+    vis_q_weight = target_weight_s[0]
+    vis_s_image = support_img[0].detach().cpu().numpy().transpose(1, 2, 0)
+    vis_q_image = query_img[0].detach().cpu().numpy().transpose(1, 2, 0)
+    support_kp = kp_src_3d
+    plot_results(vis_s_image,
+                 vis_q_image,
+                 support_kp,
+                 vis_s_weight,
+                 None,
+                 vis_q_weight,
+                 skeleton,
+                 None,
+                 torch.tensor(outputs['points']).squeeze(0),
+                 out_dir=args.outdir)
+if __name__ == '__main__':
+    main()

docker/Dockerfile ADDED Viewed

	@@ -0,0 +1,50 @@

+ARG PYTORCH="2.0.1"
+ARG CUDA="11.7"
+ARG CUDNN="8"
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+# Build-time only: keeps apt non-interactive during the build without baking
+# the setting into the runtime environment of the container.
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX" \
+    TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
+    TZ=Asia/Kolkata
+# ENV performs no command substitution, so "$(dirname $(which conda))/../"
+# was stored as a literal string. Use the conda prefix directly instead.
+# NOTE(review): assumes conda is installed under /opt/conda in the base image.
+ENV CMAKE_PREFIX_PATH="/opt/conda"
+# To fix GPG key error when running apt-get update
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+RUN apt-get update && apt-get install -y \
+        git \
+        libgl1-mesa-glx \
+        libglib2.0-0 \
+        libsm6 \
+        libxext6 \
+        libxrender-dev \
+        ninja-build \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Install xtcocotools (cython must be present before it is built)
+RUN pip install --no-cache-dir cython
+RUN pip install --no-cache-dir xtcocotools
+# Install MMEngine and MMCV
+RUN pip install --no-cache-dir openmim
+RUN mim install mmengine
+RUN mim install "mmpose==0.28.1"
+RUN mim install "mmcv-full==1.5.3"
+RUN pip install --no-cache-dir -U torchmetrics timm
+RUN pip install --no-cache-dir numpy scipy --upgrade
+RUN pip install --no-cache-dir future tensorboard
+# Absolute paths equivalent to the former relative "WORKDIR PoseAnything".
+# NOTE(review): assumes the base image's working directory is /workspace, so
+# the project still ends up in /workspace/PoseAnything/PoseAnything.
+WORKDIR /workspace/PoseAnything
+COPY models PoseAnything/models
+COPY configs PoseAnything/configs
+COPY pretrained PoseAnything/pretrained
+COPY tools PoseAnything/tools
+COPY requirements.txt setup.cfg setup.py test.py train.py README.md PoseAnything/
+RUN mkdir -p PoseAnything/data/mp100
+WORKDIR /workspace/PoseAnything/PoseAnything
+# Install MMPose
+# -y: never wait for a confirmation prompt during a non-interactive build.
+RUN conda clean --all -y
+ENV FORCE_CUDA="1"
+RUN python setup.py develop

gradio_teaser.png ADDED Viewed

models/VERSION ADDED Viewed

	@@ -0,0 +1 @@


1	+ 0.2.0