Upload Vision_Releases_transformers_4_56.ipynb
Browse files
Vision_Releases_transformers_4_56.ipynb
CHANGED
|
@@ -1737,7 +1737,7 @@
|
|
| 1737 |
"\n",
|
| 1738 |
"New transformers release comes with amazing vision/multimodal models: Florence-2 by MSFT, SAM-2 by Meta, KOSMOS-2.5 by MSFT, MetaCLIP2 by Meta, all runnable in Colab free tier. This notebook enables you to try them all!\n",
|
| 1739 |
"\n",
|
| 1740 |
-
"Note: This notebook has a lot of image outputs, so you need to run the notebook to see them."
|
| 1741 |
],
|
| 1742 |
"metadata": {
|
| 1743 |
"id": "zwCKwR_TkwLy"
|
|
@@ -1809,7 +1809,7 @@
|
|
| 1809 |
"metadata": {
|
| 1810 |
"id": "JudF0LvsgInQ"
|
| 1811 |
},
|
| 1812 |
-
"execution_count":
|
| 1813 |
"outputs": []
|
| 1814 |
},
|
| 1815 |
{
|
|
@@ -1824,7 +1824,7 @@
|
|
| 1824 |
"metadata": {
|
| 1825 |
"id": "s3oe_JZ0hiY5"
|
| 1826 |
},
|
| 1827 |
-
"execution_count":
|
| 1828 |
"outputs": []
|
| 1829 |
},
|
| 1830 |
{
|
|
@@ -1841,7 +1841,7 @@
|
|
| 1841 |
},
|
| 1842 |
"outputId": "750f0535-6804-403e-98d7-da390d4a1be3"
|
| 1843 |
},
|
| 1844 |
-
"execution_count":
|
| 1845 |
"outputs": [
|
| 1846 |
{
|
| 1847 |
"output_type": "stream",
|
|
@@ -1877,7 +1877,7 @@
|
|
| 1877 |
"metadata": {
|
| 1878 |
"id": "FndxHptOinaC"
|
| 1879 |
},
|
| 1880 |
-
"execution_count":
|
| 1881 |
"outputs": []
|
| 1882 |
},
|
| 1883 |
{
|
|
@@ -1894,7 +1894,7 @@
|
|
| 1894 |
},
|
| 1895 |
"outputId": "de6c2001-caef-4184-8430-448815eabbb3"
|
| 1896 |
},
|
| 1897 |
-
"execution_count":
|
| 1898 |
"outputs": [
|
| 1899 |
{
|
| 1900 |
"output_type": "stream",
|
|
@@ -1978,7 +1978,7 @@
|
|
| 1978 |
"source": [
|
| 1979 |
"## Kosmos 2.5\n",
|
| 1980 |
"\n",
|
| 1981 |
-
"Kosmos 2.5 by Microsoft is a great document model that can not only convert documents to markdown, it also can locate meaningful structures on documents.\n",
|
| 1982 |
"It has a \"normal\" checkpoint and a \"chat\" checkpoint which can be used for VQA tasks. Let's see how to use it."
|
| 1983 |
],
|
| 1984 |
"metadata": {
|
|
@@ -2012,7 +2012,7 @@
|
|
| 2012 |
"metadata": {
|
| 2013 |
"id": "tPSqn-POl4up"
|
| 2014 |
},
|
| 2015 |
-
"execution_count":
|
| 2016 |
"outputs": []
|
| 2017 |
},
|
| 2018 |
{
|
|
@@ -2054,7 +2054,7 @@
|
|
| 2054 |
},
|
| 2055 |
"outputId": "0bdc32d0-3897-46ef-f897-b2208e1cc28d"
|
| 2056 |
},
|
| 2057 |
-
"execution_count":
|
| 2058 |
"outputs": [
|
| 2059 |
{
|
| 2060 |
"output_type": "stream",
|
|
@@ -2157,7 +2157,7 @@
|
|
| 2157 |
},
|
| 2158 |
"outputId": "58d36158-c08c-4faf-897c-f9a83ce02760"
|
| 2159 |
},
|
| 2160 |
-
"execution_count":
|
| 2161 |
"outputs": [
|
| 2162 |
{
|
| 2163 |
"output_type": "stream",
|
|
@@ -2208,7 +2208,7 @@
|
|
| 2208 |
"metadata": {
|
| 2209 |
"id": "VbHH5RQ2qzlo"
|
| 2210 |
},
|
| 2211 |
-
"execution_count":
|
| 2212 |
"outputs": []
|
| 2213 |
},
|
| 2214 |
{
|
|
@@ -2221,7 +2221,7 @@
|
|
| 2221 |
"metadata": {
|
| 2222 |
"id": "XjCTXKWCmWdI"
|
| 2223 |
},
|
| 2224 |
-
"execution_count":
|
| 2225 |
"outputs": []
|
| 2226 |
},
|
| 2227 |
{
|
|
@@ -2249,7 +2249,7 @@
|
|
| 2249 |
},
|
| 2250 |
"outputId": "bab3453d-5846-4518-be23-92cdc0d9e5a1"
|
| 2251 |
},
|
| 2252 |
-
"execution_count":
|
| 2253 |
"outputs": [
|
| 2254 |
{
|
| 2255 |
"output_type": "stream",
|
|
@@ -2345,7 +2345,7 @@
|
|
| 2345 |
},
|
| 2346 |
"outputId": "3f012250-0a09-492e-d675-3950ff447e5f"
|
| 2347 |
},
|
| 2348 |
-
"execution_count":
|
| 2349 |
"outputs": [
|
| 2350 |
{
|
| 2351 |
"output_type": "display_data",
|
|
@@ -2452,7 +2452,7 @@
|
|
| 2452 |
"metadata": {
|
| 2453 |
"id": "kxbraMeuvMni"
|
| 2454 |
},
|
| 2455 |
-
"execution_count":
|
| 2456 |
"outputs": []
|
| 2457 |
},
|
| 2458 |
{
|
|
@@ -2482,7 +2482,7 @@
|
|
| 2482 |
"metadata": {
|
| 2483 |
"id": "LZ1fD_VTvLdV"
|
| 2484 |
},
|
| 2485 |
-
"execution_count":
|
| 2486 |
"outputs": []
|
| 2487 |
},
|
| 2488 |
{
|
|
@@ -2506,7 +2506,7 @@
|
|
| 2506 |
"id": "0mRS1uGWPieQ",
|
| 2507 |
"outputId": "abb87538-5a2f-4be8-9a7e-fa860b872321"
|
| 2508 |
},
|
| 2509 |
-
"execution_count":
|
| 2510 |
"outputs": [
|
| 2511 |
{
|
| 2512 |
"output_type": "execute_result",
|
|
@@ -2606,7 +2606,7 @@
|
|
| 2606 |
"id": "n1FrIXFWRi_d",
|
| 2607 |
"outputId": "d3b3bc0c-058d-46bd-a1fb-df128f81acef"
|
| 2608 |
},
|
| 2609 |
-
"execution_count":
|
| 2610 |
"outputs": [
|
| 2611 |
{
|
| 2612 |
"output_type": "stream",
|
|
@@ -2648,7 +2648,7 @@
|
|
| 2648 |
"metadata": {
|
| 2649 |
"id": "ZF-ANJiLRTjE"
|
| 2650 |
},
|
| 2651 |
-
"execution_count":
|
| 2652 |
"outputs": []
|
| 2653 |
},
|
| 2654 |
{
|
|
@@ -2683,7 +2683,7 @@
|
|
| 2683 |
"metadata": {
|
| 2684 |
"id": "6HZUQjytSBfd"
|
| 2685 |
},
|
| 2686 |
-
"execution_count":
|
| 2687 |
"outputs": []
|
| 2688 |
},
|
| 2689 |
{
|
|
@@ -2706,7 +2706,7 @@
|
|
| 2706 |
"metadata": {
|
| 2707 |
"id": "RdAy8fwQSSSF"
|
| 2708 |
},
|
| 2709 |
-
"execution_count":
|
| 2710 |
"outputs": []
|
| 2711 |
},
|
| 2712 |
{
|
|
@@ -2717,7 +2717,7 @@
|
|
| 2717 |
"metadata": {
|
| 2718 |
"id": "132QpoZDScgw"
|
| 2719 |
},
|
| 2720 |
-
"execution_count":
|
| 2721 |
"outputs": []
|
| 2722 |
},
|
| 2723 |
{
|
|
@@ -2750,7 +2750,7 @@
|
|
| 2750 |
"metadata": {
|
| 2751 |
"id": "6V5JQ7pqS4GG"
|
| 2752 |
},
|
| 2753 |
-
"execution_count":
|
| 2754 |
"outputs": []
|
| 2755 |
},
|
| 2756 |
{
|
|
@@ -2772,7 +2772,7 @@
|
|
| 2772 |
"id": "fhhtCRo2S6hR",
|
| 2773 |
"outputId": "070f7b3e-48c0-46a1-f3cb-115a9e77fa97"
|
| 2774 |
},
|
| 2775 |
-
"execution_count":
|
| 2776 |
"outputs": [
|
| 2777 |
{
|
| 2778 |
"output_type": "stream",
|
|
@@ -2855,9 +2855,9 @@
|
|
| 2855 |
"base_uri": "https://localhost:8080/"
|
| 2856 |
},
|
| 2857 |
"id": "eX-_Cl9jVMQA",
|
| 2858 |
-
"outputId": "
|
| 2859 |
},
|
| 2860 |
-
"execution_count":
|
| 2861 |
"outputs": [
|
| 2862 |
{
|
| 2863 |
"output_type": "execute_result",
|
|
@@ -2873,7 +2873,7 @@
|
|
| 2873 |
]
|
| 2874 |
},
|
| 2875 |
"metadata": {},
|
| 2876 |
-
"execution_count":
|
| 2877 |
}
|
| 2878 |
]
|
| 2879 |
},
|
|
@@ -2902,6 +2902,21 @@
|
|
| 2902 |
},
|
| 2903 |
"execution_count": null,
|
| 2904 |
"outputs": []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2905 |
}
|
| 2906 |
]
|
| 2907 |
}
|
|
|
|
| 1737 |
"\n",
|
| 1738 |
"New transformers release comes with amazing vision/multimodal models: Florence-2 by MSFT, SAM-2 by Meta, KOSMOS-2.5 by MSFT, MetaCLIP2 by Meta, all runnable in Colab free tier. This notebook enables you to try them all!\n",
|
| 1739 |
"\n",
|
| 1740 |
+
"Note: This notebook has a lot of image outputs, so you need to run the notebook to see them. There are links at the end to each model's documentation; check them out for more inference options & info!"
|
| 1741 |
],
|
| 1742 |
"metadata": {
|
| 1743 |
"id": "zwCKwR_TkwLy"
|
|
|
|
| 1809 |
"metadata": {
|
| 1810 |
"id": "JudF0LvsgInQ"
|
| 1811 |
},
|
| 1812 |
+
"execution_count": null,
|
| 1813 |
"outputs": []
|
| 1814 |
},
|
| 1815 |
{
|
|
|
|
| 1824 |
"metadata": {
|
| 1825 |
"id": "s3oe_JZ0hiY5"
|
| 1826 |
},
|
| 1827 |
+
"execution_count": null,
|
| 1828 |
"outputs": []
|
| 1829 |
},
|
| 1830 |
{
|
|
|
|
| 1841 |
},
|
| 1842 |
"outputId": "750f0535-6804-403e-98d7-da390d4a1be3"
|
| 1843 |
},
|
| 1844 |
+
"execution_count": null,
|
| 1845 |
"outputs": [
|
| 1846 |
{
|
| 1847 |
"output_type": "stream",
|
|
|
|
| 1877 |
"metadata": {
|
| 1878 |
"id": "FndxHptOinaC"
|
| 1879 |
},
|
| 1880 |
+
"execution_count": null,
|
| 1881 |
"outputs": []
|
| 1882 |
},
|
| 1883 |
{
|
|
|
|
| 1894 |
},
|
| 1895 |
"outputId": "de6c2001-caef-4184-8430-448815eabbb3"
|
| 1896 |
},
|
| 1897 |
+
"execution_count": null,
|
| 1898 |
"outputs": [
|
| 1899 |
{
|
| 1900 |
"output_type": "stream",
|
|
|
|
| 1978 |
"source": [
|
| 1979 |
"## Kosmos 2.5\n",
|
| 1980 |
"\n",
|
| 1981 |
+
"Kosmos 2.5 by Microsoft is a great document model that can not only convert documents to markdown but also locate meaningful structures in documents and mark parts of documents with bounding boxes. You can try [this demo](https://huggingface.co/spaces/nielsr/kosmos-2.5-demo) to see what it can do.\n",
|
| 1982 |
"It has a \"normal\" checkpoint and a \"chat\" checkpoint which can be used for VQA tasks. Let's see how to use it."
|
| 1983 |
],
|
| 1984 |
"metadata": {
|
|
|
|
| 2012 |
"metadata": {
|
| 2013 |
"id": "tPSqn-POl4up"
|
| 2014 |
},
|
| 2015 |
+
"execution_count": null,
|
| 2016 |
"outputs": []
|
| 2017 |
},
|
| 2018 |
{
|
|
|
|
| 2054 |
},
|
| 2055 |
"outputId": "0bdc32d0-3897-46ef-f897-b2208e1cc28d"
|
| 2056 |
},
|
| 2057 |
+
"execution_count": null,
|
| 2058 |
"outputs": [
|
| 2059 |
{
|
| 2060 |
"output_type": "stream",
|
|
|
|
| 2157 |
},
|
| 2158 |
"outputId": "58d36158-c08c-4faf-897c-f9a83ce02760"
|
| 2159 |
},
|
| 2160 |
+
"execution_count": null,
|
| 2161 |
"outputs": [
|
| 2162 |
{
|
| 2163 |
"output_type": "stream",
|
|
|
|
| 2208 |
"metadata": {
|
| 2209 |
"id": "VbHH5RQ2qzlo"
|
| 2210 |
},
|
| 2211 |
+
"execution_count": null,
|
| 2212 |
"outputs": []
|
| 2213 |
},
|
| 2214 |
{
|
|
|
|
| 2221 |
"metadata": {
|
| 2222 |
"id": "XjCTXKWCmWdI"
|
| 2223 |
},
|
| 2224 |
+
"execution_count": null,
|
| 2225 |
"outputs": []
|
| 2226 |
},
|
| 2227 |
{
|
|
|
|
| 2249 |
},
|
| 2250 |
"outputId": "bab3453d-5846-4518-be23-92cdc0d9e5a1"
|
| 2251 |
},
|
| 2252 |
+
"execution_count": null,
|
| 2253 |
"outputs": [
|
| 2254 |
{
|
| 2255 |
"output_type": "stream",
|
|
|
|
| 2345 |
},
|
| 2346 |
"outputId": "3f012250-0a09-492e-d675-3950ff447e5f"
|
| 2347 |
},
|
| 2348 |
+
"execution_count": null,
|
| 2349 |
"outputs": [
|
| 2350 |
{
|
| 2351 |
"output_type": "display_data",
|
|
|
|
| 2452 |
"metadata": {
|
| 2453 |
"id": "kxbraMeuvMni"
|
| 2454 |
},
|
| 2455 |
+
"execution_count": null,
|
| 2456 |
"outputs": []
|
| 2457 |
},
|
| 2458 |
{
|
|
|
|
| 2482 |
"metadata": {
|
| 2483 |
"id": "LZ1fD_VTvLdV"
|
| 2484 |
},
|
| 2485 |
+
"execution_count": null,
|
| 2486 |
"outputs": []
|
| 2487 |
},
|
| 2488 |
{
|
|
|
|
| 2506 |
"id": "0mRS1uGWPieQ",
|
| 2507 |
"outputId": "abb87538-5a2f-4be8-9a7e-fa860b872321"
|
| 2508 |
},
|
| 2509 |
+
"execution_count": null,
|
| 2510 |
"outputs": [
|
| 2511 |
{
|
| 2512 |
"output_type": "execute_result",
|
|
|
|
| 2606 |
"id": "n1FrIXFWRi_d",
|
| 2607 |
"outputId": "d3b3bc0c-058d-46bd-a1fb-df128f81acef"
|
| 2608 |
},
|
| 2609 |
+
"execution_count": null,
|
| 2610 |
"outputs": [
|
| 2611 |
{
|
| 2612 |
"output_type": "stream",
|
|
|
|
| 2648 |
"metadata": {
|
| 2649 |
"id": "ZF-ANJiLRTjE"
|
| 2650 |
},
|
| 2651 |
+
"execution_count": null,
|
| 2652 |
"outputs": []
|
| 2653 |
},
|
| 2654 |
{
|
|
|
|
| 2683 |
"metadata": {
|
| 2684 |
"id": "6HZUQjytSBfd"
|
| 2685 |
},
|
| 2686 |
+
"execution_count": null,
|
| 2687 |
"outputs": []
|
| 2688 |
},
|
| 2689 |
{
|
|
|
|
| 2706 |
"metadata": {
|
| 2707 |
"id": "RdAy8fwQSSSF"
|
| 2708 |
},
|
| 2709 |
+
"execution_count": null,
|
| 2710 |
"outputs": []
|
| 2711 |
},
|
| 2712 |
{
|
|
|
|
| 2717 |
"metadata": {
|
| 2718 |
"id": "132QpoZDScgw"
|
| 2719 |
},
|
| 2720 |
+
"execution_count": null,
|
| 2721 |
"outputs": []
|
| 2722 |
},
|
| 2723 |
{
|
|
|
|
| 2750 |
"metadata": {
|
| 2751 |
"id": "6V5JQ7pqS4GG"
|
| 2752 |
},
|
| 2753 |
+
"execution_count": null,
|
| 2754 |
"outputs": []
|
| 2755 |
},
|
| 2756 |
{
|
|
|
|
| 2772 |
"id": "fhhtCRo2S6hR",
|
| 2773 |
"outputId": "070f7b3e-48c0-46a1-f3cb-115a9e77fa97"
|
| 2774 |
},
|
| 2775 |
+
"execution_count": null,
|
| 2776 |
"outputs": [
|
| 2777 |
{
|
| 2778 |
"output_type": "stream",
|
|
|
|
| 2855 |
"base_uri": "https://localhost:8080/"
|
| 2856 |
},
|
| 2857 |
"id": "eX-_Cl9jVMQA",
|
| 2858 |
+
"outputId": "2ab6a027-9467-4183-875b-a55cd874d862"
|
| 2859 |
},
|
| 2860 |
+
"execution_count": null,
|
| 2861 |
"outputs": [
|
| 2862 |
{
|
| 2863 |
"output_type": "execute_result",
|
|
|
|
| 2873 |
]
|
| 2874 |
},
|
| 2875 |
"metadata": {},
|
| 2876 |
+
"execution_count": 41
|
| 2877 |
}
|
| 2878 |
]
|
| 2879 |
},
|
|
|
|
| 2902 |
},
|
| 2903 |
"execution_count": null,
|
| 2904 |
"outputs": []
|
| 2905 |
+
},
|
| 2906 |
+
{
|
| 2907 |
+
"cell_type": "markdown",
|
| 2908 |
+
"source": [
|
| 2909 |
+
"## Docs\n",
|
| 2910 |
+
"Get more info in the links below!\n",
|
| 2911 |
+
"- [SAM2 docs](https://huggingface.co/docs/transformers/main/en/model_doc/sam2)\n",
|
| 2912 |
+
"- [KOSMOS2.5 docs](https://huggingface.co/docs/transformers/main/en/model_doc/kosmos2_5)\n",
|
| 2913 |
+
"- [Florence-2 docs](https://huggingface.co/docs/transformers/main/en/model_doc/florence2)\n",
|
| 2914 |
+
"- [DINOv3 docs](https://huggingface.co/docs/transformers/main/en/model_doc/dinov3)\n",
|
| 2915 |
+
"- [MetaCLIP2 docs](https://huggingface.co/docs/transformers/main/en/model_doc/metaclip_2)"
|
| 2916 |
+
],
|
| 2917 |
+
"metadata": {
|
| 2918 |
+
"id": "N3Yp8BZNd7aM"
|
| 2919 |
+
}
|
| 2920 |
}
|
| 2921 |
]
|
| 2922 |
}
|