{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "file_extension": ".py", "kernelspec": { "display_name": "Python 3.7.7 64-bit ('datasets': conda)", "language": "python", "name": "python37764bitdatasetscondae5d8ff60608e4c5c953d6bb643d8ebc5" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" }, "mimetype": "text/x-python", "name": "python", "npconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 3, "colab": { "name": "HuggingFace nlp library - Overview", "provenance": [], "toc_visible": true, "include_colab_link": true }, "widgets": { "application/vnd.jupyter.widget-state+json": { "3b0e2e80c42c4a27991f2fa3c427e861": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_2d6757ef18e448e9af7fb4d874ff6f9d", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_ed4ec751bf7d4d0d9643634c7adb9d56", "IPY_MODEL_d2c54ae694a3453d8f62dd8d07d4b356" ] } }, "2d6757ef18e448e9af7fb4d874ff6f9d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": 
"LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "ed4ec751bf7d4d0d9643634c7adb9d56": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_04ba44b50cab4929940e8044d1fb350b", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 4997, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 4997, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_476623e4ec3845d6b2671f42bec9c7a1" } }, "d2c54ae694a3453d8f62dd8d07d4b356": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_ec220965dd954f89a66a00a5d18c7f80", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 5.00k/5.00k [00:03<00:00, 1.27kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_beb50276ed0642658057d665027d1009" } }, "04ba44b50cab4929940e8044d1fb350b": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", 
"_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "476623e4ec3845d6b2671f42bec9c7a1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "ec220965dd954f89a66a00a5d18c7f80": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "beb50276ed0642658057d665027d1009": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, 
"flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "7f56212ab0e44b21bd09af4bce4fd64e": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_1dfb3c7bb76d49e49fcc851f06bb18d2", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_2a41b6c083924c6f888e5b61b0fffdb8", "IPY_MODEL_f8b2492e22fa43e3aef3c7590ee851ef" ] } }, "1dfb3c7bb76d49e49fcc851f06bb18d2": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": 
"LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "2a41b6c083924c6f888e5b61b0fffdb8": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_8cefea541e2b4261a959eb11adc4e834", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 2240, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 2240, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_fcd5574dcbf64b8fb1ad6a3ce663fd3f" } }, "f8b2492e22fa43e3aef3c7590ee851ef": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_8d4b0c3b71404f7ebb0e9267888dde40", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 2.24k/2.24k [00:03<00:00, 667B/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_90ee191829f64774856a9323d0ae9e56" } }, "8cefea541e2b4261a959eb11adc4e834": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", 
"_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "fcd5574dcbf64b8fb1ad6a3ce663fd3f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "8d4b0c3b71404f7ebb0e9267888dde40": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "90ee191829f64774856a9323d0ae9e56": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, 
"flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "899eba20bfe642d6819e70850a274f50": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_ef9113f13e9b462fbdf8aae0fff7a9f7", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_ecc67a44f8154fb4a89b6325ee3462fb", "IPY_MODEL_c6c40c130e7a4e7f9c69a2a1248f2e57" ] } }, "ef9113f13e9b462fbdf8aae0fff7a9f7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": 
"LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "ecc67a44f8154fb4a89b6325ee3462fb": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_634ea5af72744cf69b152bc8149a6cea", "_dom_classes": [], "description": "Downloading: ", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 8116577, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 8116577, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_366b47f4ab784e0c85d9a644b548ecb0" } }, "c6c40c130e7a4e7f9c69a2a1248f2e57": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_0322a2345daa494585e3d37d22c91b40", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 30.3M/? 
[00:01<00:00, 23.8MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_94c0c72cf3ec43b6b3160607eb957a3d" } }, "634ea5af72744cf69b152bc8149a6cea": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "366b47f4ab784e0c85d9a644b548ecb0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "0322a2345daa494585e3d37d22c91b40": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": 
"@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "94c0c72cf3ec43b6b3160607eb957a3d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "9a633bb0bfc3488e8904224af81ae165": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_c275e39a9e724417a0f21c767aad4222", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_9f343d5380c9419c8dd831f251a1fc81", "IPY_MODEL_c29ec644dbc444a793b92fbfa6d3e4bc" ] } }, "c275e39a9e724417a0f21c767aad4222": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": 
null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "9f343d5380c9419c8dd831f251a1fc81": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_49151a31158e4d71a9b4ad1b9b9fe1c8", "_dom_classes": [], "description": "Downloading: ", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 1054280, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 1054280, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_2a949fb7059b48e7ac76cae962c50833" } }, "c29ec644dbc444a793b92fbfa6d3e4bc": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_e1483680a79748aa9de08569ba044038", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 4.85M/? 
[00:00<00:00, 11.1MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_cd3fede4d6cf496c95d108c7f6ed039b" } }, "49151a31158e4d71a9b4ad1b9b9fe1c8": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "2a949fb7059b48e7ac76cae962c50833": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "e1483680a79748aa9de08569ba044038": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": 
"@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "cd3fede4d6cf496c95d108c7f6ed039b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "7c6b80079af245a68bccd2c9aec20f00": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_1fae38301f3b41bbbf4db13d2fe6032b", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_148572d3ac064d11b0425afada62ce70", "IPY_MODEL_f03399e96fb7491ab83f709c33c48622" ] } }, "1fae38301f3b41bbbf4db13d2fe6032b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": 
null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "148572d3ac064d11b0425afada62ce70": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_16b696fec4ab44e9aa22687e403d9eed", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "info", "max": 1, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 1, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_5033d15d013b41bfb2988aebecaa0f2e" } }, "f03399e96fb7491ab83f709c33c48622": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_391ed86689b04856a458c929993afb52", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 87599/0 [00:04<00:00, 13300.43 
examples/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_095b84a1c63149a0950d49310d5ccdd3" } }, "16b696fec4ab44e9aa22687e403d9eed": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "5033d15d013b41bfb2988aebecaa0f2e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "391ed86689b04856a458c929993afb52": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", 
"_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "095b84a1c63149a0950d49310d5ccdd3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "449cddc30e5a4755b9b28e030d86b78f": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_dc05828e9d114ec2a293a0e7cee8b63f", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_9808e758d3e447bcbc388b85441538d3", "IPY_MODEL_9ecc410b1fdd4c2c89a34721c52dc464" ] } }, "dc05828e9d114ec2a293a0e7cee8b63f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, 
"justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "9808e758d3e447bcbc388b85441538d3": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_c3fa7c31d80946dcae7d6e8b5564c02f", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "info", "max": 1, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 1, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_feba5a8a7b32462a8c4a585c62e12785" } }, "9ecc410b1fdd4c2c89a34721c52dc464": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_90c0b0457e194b75ae8adf1e13936b27", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 10570/0 [00:00<00:00, 24.93 examples/s]", "_view_count": null, 
"_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_2f8cb0bb9f2a42b08cef846c5d3c8a52" } }, "c3fa7c31d80946dcae7d6e8b5564c02f": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "feba5a8a7b32462a8c4a585c62e12785": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "90c0b0457e194b75ae8adf1e13936b27": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", 
"_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "2f8cb0bb9f2a42b08cef846c5d3c8a52": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "c95167de0c674bbd88e95df1271dd9ab": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_000f6fb64c164486bdafe2c61860899a", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_c7304e5dece143a2a346762b482e4b8d", "IPY_MODEL_1ddcdd9899e344f981d06939882d574a" ] } }, "000f6fb64c164486bdafe2c61860899a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, 
"_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "c7304e5dece143a2a346762b482e4b8d": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_6b0f78412d694cbb8d7d9dc465e8661d", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 213450, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 213450, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_3ae828d2a56c48f79c049c415defd21c" } }, "1ddcdd9899e344f981d06939882d574a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_1afa1c44c06a44e79adab422e14ff59a", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 213k/213k [00:00<00:00, 636kB/s]", "_view_count": null, 
"_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_e82d9d1e030f41128b265c738e83771e" } }, "6b0f78412d694cbb8d7d9dc465e8661d": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "3ae828d2a56c48f79c049c415defd21c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "1afa1c44c06a44e79adab422e14ff59a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", 
"_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "e82d9d1e030f41128b265c738e83771e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "76f672fa3f5d4ee9a79409043a763938": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_57a5736e9d634e74995df300e07d53e4", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_fab57dec41a9439b9e822d20785f06c8", "IPY_MODEL_ccb3f04fa88d4e66a8c04d829de998e3" ] } }, "57a5736e9d634e74995df300e07d53e4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, 
"_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "fab57dec41a9439b9e822d20785f06c8": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_d7213a66331e4b37af0e8c9a51a96592", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 411, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 411, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_37592a88c2e244a99ae281f4f127465b" } }, "ccb3f04fa88d4e66a8c04d829de998e3": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_8fbb0ca9e6e7402bbeccdf822cfe5189", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 411/411 [00:00<00:00, 581B/s]", "_view_count": null, 
"_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_00298c6910ec47da9b05014493f71545" } }, "d7213a66331e4b37af0e8c9a51a96592": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "37592a88c2e244a99ae281f4f127465b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "8fbb0ca9e6e7402bbeccdf822cfe5189": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", 
"_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "00298c6910ec47da9b05014493f71545": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "3333a69504ea4ea98f20ccf31b54a96b": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_bc21e8ef90c54a0280ee97431f5404c9", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_ff60cb6f5b6d4f928d624b8d7bc96ac3", "IPY_MODEL_77044c08a02441b68c9bc78fd340df7d" ] } }, "bc21e8ef90c54a0280ee97431f5404c9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, 
"_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "ff60cb6f5b6d4f928d624b8d7bc96ac3": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_67003b0a383a4456913fdd0930cfcefa", "_dom_classes": [], "description": "Downloading: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 263273408, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 263273408, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_0c72a266137a4ca199f13236b574f3e1" } }, "77044c08a02441b68c9bc78fd340df7d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_ba1efb6f24ac4209b1cae526a85ec4f1", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 263M/263M [04:01<00:00, 1.09MB/s]", "_view_count": 
null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_4e3757c6b2f04013b40cdd4d3a4d127f" } }, "67003b0a383a4456913fdd0930cfcefa": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "0c72a266137a4ca199f13236b574f3e1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "ba1efb6f24ac4209b1cae526a85ec4f1": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": 
"1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "4e3757c6b2f04013b40cdd4d3a4d127f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "zNp6kK7OvSUg", "colab_type": "text" }, "source": [ "# HuggingFace `nlp` library - Quick overview\n", "\n", "Models come and go (linear models, LSTM, Transformers, ...) 
but two core elements have consistently been the beating heart of Natural Language Processing: Datasets & Metrics\n", "\n", "`nlp` is a lightweight and extensible library to easily share and load datasets and evaluation metrics, already providing access to ~100 datasets and ~10 evaluation metrics.\n", "\n", "The library has several interesting features (besides easy access to datasets/metrics):\n", "\n", "- Built-in interoperability with PyTorch, TensorFlow 2, Pandas and NumPy\n", "- Small and fast library with a transparent and pythonic API\n", "- Thrive on large datasets: `nlp` naturally frees you from RAM limits, since all datasets are memory-mapped on disk by default.\n", "- Smart caching with an intelligent `tf.data`-like cache: never wait for your data to be processed more than once\n", "\n", "`nlp` originated from a fork of the awesome TensorFlow Datasets and the HuggingFace team wants to deeply thank the team behind this amazing library and user API. We have tried to keep a layer of compatibility with `tfds` and a converter can translate from one format to the other." 
] }, { "cell_type": "markdown", "metadata": { "id": "dzk9aEtIvSUh", "colab_type": "text" }, "source": [ "# Main datasets API\n", "\n", "This notebook is a quick dive in the main user API for loading datasets in `nlp`" ] }, { "cell_type": "code", "metadata": { "id": "my95uHbLyjwR", "colab_type": "code", "outputId": "ec28598b-1af5-42e4-c87d-0ddfa9e63632", "colab": { "base_uri": "https://localhost:8080/", "height": 221 } }, "source": [ "# install nlp\n", "!pip install nlp\n", "\n", "# Make sure that we have a recent version of pyarrow in the session before we continue - otherwise reboot Colab to activate it\n", "import pyarrow\n", "if int(pyarrow.__version__.split('.')[1]) < 16 and int(pyarrow.__version__.split('.')[0]) == 0:\n", " import os\n", " os.kill(os.getpid(), 9)" ], "execution_count": 11, "outputs": [ { "output_type": "stream", "text": [ "Requirement already satisfied: nlp in /usr/local/lib/python3.6/dist-packages (0.2.0)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from nlp) (1.18.4)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.6/dist-packages (from nlp) (2.23.0)\n", "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from nlp) (0.7)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from nlp) (3.0.12)\n", "Requirement already satisfied: dill in /usr/local/lib/python3.6/dist-packages (from nlp) (0.3.1.1)\n", "Requirement already satisfied: pyarrow>=0.16.0 in /usr/local/lib/python3.6/dist-packages (from nlp) (0.17.1)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from nlp) (4.41.1)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->nlp) (1.24.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from 
requests>=2.19.0->nlp) (2020.4.5.1)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->nlp) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.19.0->nlp) (2.9)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "hJHyEmievSUh", "colab_type": "code", "outputId": "afc32e2a-6d42-4d77-fee6-0afdb5a1f206", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "import logging\n", "logging.basicConfig(level=logging.INFO)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "17\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "PVjXLiYxvSUl", "colab_type": "code", "outputId": "55de9ff7-4f91-4f82-f039-dda379e3aa0e", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "# Let's import the library\n", "import nlp" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.utils.file_utils:PyTorch version 1.5.0+cu101 available.\n", "INFO:nlp.utils.file_utils:TensorFlow version 2.2.0 available.\n" ], "name": "stderr" } ] }, { "cell_type": "markdown", "metadata": { "id": "TNloBBx-vSUo", "colab_type": "text" }, "source": [ "## Listing the currently available datasets and metrics" ] }, { "cell_type": "code", "metadata": { "id": "d3RJisGLvSUp", "colab_type": "code", "outputId": "2e07e795-e49c-4a95-e418-5a418e7820d0", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 } }, "source": [ "# Currently available datasets and metrics\n", "datasets = nlp.list_datasets()\n", "metrics = nlp.list_metrics()\n", "\n", "print(f\"🤩 Currently {len(datasets)} datasets are available on HuggingFace AWS bucket: \\n\" \n", " + '\\n'.join(dataset.id for dataset in datasets) + '\\n')\n", "print(f\"🤩 Currently {len(metrics)} metrics are available on HuggingFace AWS bucket: \\n\" \n", " + '\\n'.join(metric.id for 
metric in metrics))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "🤩 Currently 114 datasets are available on HuggingFace AWS bucket: \n", "aeslc\n", "ai2_arc\n", "anli\n", "arcd\n", "art\n", "billsum\n", "blimp\n", "blog_authorship_corpus\n", "boolq\n", "break_data\n", "c4\n", "cfq\n", "civil_comments\n", "cmrc2018\n", "cnn_dailymail\n", "coarse_discourse\n", "com_qa\n", "commonsense_qa\n", "coqa\n", "cornell_movie_dialog\n", "cos_e\n", "cosmos_qa\n", "crime_and_punish\n", "csv\n", "definite_pronoun_resolution\n", "discofuse\n", "drop\n", "empathetic_dialogues\n", "eraser_multi_rc\n", "esnli\n", "event2Mind\n", "flores\n", "fquad\n", "gap\n", "germeval_14\n", "gigaword\n", "glue\n", "hansards\n", "hellaswag\n", "imdb\n", "jeopardy\n", "json\n", "kor_nli\n", "lc_quad\n", "librispeech_lm\n", "lm1b\n", "math_dataset\n", "math_qa\n", "mlqa\n", "movie_rationales\n", "multi_news\n", "multi_nli\n", "multi_nli_mismatch\n", "natural_questions\n", "newsroom\n", "openbookqa\n", "opinosis\n", "para_crawl\n", "qa4mre\n", "qangaroo\n", "qanta\n", "qasc\n", "quarel\n", "quartz\n", "quoref\n", "race\n", "reclor\n", "reddit\n", "reddit_tifu\n", "scan\n", "scicite\n", "scientific_papers\n", "scifact\n", "sciq\n", "scitail\n", "sentiment140\n", "snli\n", "social_i_qa\n", "squad\n", "squad_es\n", "squad_it\n", "squad_v1_pt\n", "squad_v2\n", "super_glue\n", "ted_hrlr\n", "ted_multi\n", "tiny_shakespeare\n", "trivia_qa\n", "tydiqa\n", "ubuntu_dialogs_corpus\n", "webis/tl_dr\n", "wiki40b\n", "wiki_qa\n", "wiki_split\n", "wikihow\n", "wikipedia\n", "wikitext\n", "winogrande\n", "wiqa\n", "wmt14\n", "wmt15\n", "wmt16\n", "wmt17\n", "wmt18\n", "wmt19\n", "wmt_t2t\n", "wnut_17\n", "x_stance\n", "xcopa\n", "xnli\n", "xquad\n", "xsum\n", "xtreme\n", "yelp_polarity\n", "\n", "🤩 Currently 11 metrics are available on HuggingFace AWS bucket: \n", "bertscore\n", "bleu\n", "coval\n", "gleu\n", "glue\n", "rouge\n", "sacrebleu\n", "seqeval\n", "squad\n", "squad_v2\n", 
"xnli\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "7T5AG3BxvSUr", "colab_type": "code", "outputId": "7c450740-1166-4fe9-e999-7c0d35c1d073", "colab": { "base_uri": "https://localhost:8080/", "height": 343 } }, "source": [ "# You can read a few attributes of the datasets before loading them (they are python dataclasses)\n", "from dataclasses import asdict\n", "\n", "for key, value in asdict(datasets[6]).items():\n", " print('👉 ' + key + ': ' + str(value))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "👉 id: blimp\n", "👉 key: nlp/datasets/blimp/blimp.py\n", "👉 lastModified: 2020-05-14T14:57:19.000Z\n", "👉 description: BLiMP is a challenge set for evaluating what language models (LMs) know about\n", "major grammatical phenomena in English. BLiMP consists of 67 sub-datasets, each\n", "containing 1000 minimal pairs isolating specific contrasts in syntax,\n", "morphology, or semantics. The data is automatically generated according to\n", "expert-crafted grammars.\n", "👉 citation: @article{warstadt2019blimp,\n", " title={BLiMP: A Benchmark of Linguistic Minimal Pairs for English},\n", " author={Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei, and Wang, Sheng-Fu and Bowman, Samuel R},\n", " journal={arXiv preprint arXiv:1912.00582},\n", " year={2019}\n", "}\n", "👉 size: 7307\n", "👉 etag: \"3659a5abbb1ca837439d94aa2217c5f2\"\n", "👉 siblings: [{'key': 'nlp/datasets/blimp/blimp.py', 'etag': '\"3659a5abbb1ca837439d94aa2217c5f2\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 7307, 'rfilename': 'blimp.py'}, {'key': 'nlp/datasets/blimp/dataset_infos.json', 'etag': '\"c6427bb29472ce40e7317ffb2da3eb8c\"', 'lastModified': '2020-05-14T15:43:08.000Z', 'size': 140760, 'rfilename': 'dataset_infos.json'}, {'key': 'nlp/datasets/blimp/dummy/adjunct_island/0.1.0/dummy_data-zip-extracted/dummy_data/adjunct_island.jsonl', 'etag': '\"c4963f9c5fc4e06b345e8fa7dd5c0f75\"', 'lastModified': 
'2020-05-14T14:57:19.000Z', 'size': 1674, 'rfilename': 'dummy/adjunct_island/0.1.0/dummy_data-zip-extracted/dummy_data/adjunct_island.jsonl'}, {'key': 'nlp/datasets/blimp/dummy/adjunct_island/0.1.0/dummy_data.zip', 'etag': '\"4d9b4aebaabf5e8e879bd70c346ce444\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 693, 'rfilename': 'dummy/adjunct_island/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/anaphor_gender_agreement/0.1.0/dummy_data.zip', 'etag': '\"ae6d58b49a06df42d5e1a195d2554090\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 713, 'rfilename': 'dummy/anaphor_gender_agreement/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/anaphor_number_agreement/0.1.0/dummy_data.zip', 'etag': '\"407313731ea04097977cd570fb15fad6\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 713, 'rfilename': 'dummy/anaphor_number_agreement/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/animate_subject_passive/0.1.0/dummy_data.zip', 'etag': '\"72591df75a765e08304c38ef534707fe\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 711, 'rfilename': 'dummy/animate_subject_passive/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/animate_subject_trans/0.1.0/dummy_data.zip', 'etag': '\"3d28fc9d1d86d694eb93f594e4b6402b\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 707, 'rfilename': 'dummy/animate_subject_trans/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/causative/0.1.0/dummy_data.zip', 'etag': '\"f93eac2f0293e71d4ada42b11e45b76a\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 683, 'rfilename': 'dummy/causative/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/complex_NP_island/0.1.0/dummy_data.zip', 'etag': '\"a67e08505c35bdc5a7d309a9196984e5\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 699, 'rfilename': 'dummy/complex_NP_island/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/coordinate_structure_constraint_complex_left_branch/0.1.0/dummy_data.zip', 'etag': 
'\"ec1b42518b4dfb6665236bfd114d8296\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 767, 'rfilename': 'dummy/coordinate_structure_constraint_complex_left_branch/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/coordinate_structure_constraint_object_extraction/0.1.0/dummy_data.zip', 'etag': '\"158a029bb0fa569215686021e6c7ca6c\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 763, 'rfilename': 'dummy/coordinate_structure_constraint_object_extraction/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/determiner_noun_agreement_1/0.1.0/dummy_data.zip', 'etag': '\"077540f947928995d7d7dac70599fc91\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 719, 'rfilename': 'dummy/determiner_noun_agreement_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/determiner_noun_agreement_2/0.1.0/dummy_data.zip', 'etag': '\"4fea5ce64be1c0dd799e48cd4d3f11f0\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 719, 'rfilename': 'dummy/determiner_noun_agreement_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/determiner_noun_agreement_irregular_1/0.1.0/dummy_data.zip', 'etag': '\"25c16f66bbe341f5d13915470d5a5826\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 739, 'rfilename': 'dummy/determiner_noun_agreement_irregular_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/determiner_noun_agreement_irregular_2/0.1.0/dummy_data.zip', 'etag': '\"29708c976d3c7489a6723f0e4278a9ac\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 739, 'rfilename': 'dummy/determiner_noun_agreement_irregular_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/determiner_noun_agreement_with_adj_2/0.1.0/dummy_data.zip', 'etag': '\"a78f9d469dbdf0a57f36b81330e71bdf\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 737, 'rfilename': 'dummy/determiner_noun_agreement_with_adj_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/determiner_noun_agreement_with_adj_irregular_1/0.1.0/dummy_data.zip', 'etag': 
'\"c162a33eded087712aeb9b4b1ea2dbd7\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 757, 'rfilename': 'dummy/determiner_noun_agreement_with_adj_irregular_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/determiner_noun_agreement_with_adj_irregular_2/0.1.0/dummy_data.zip', 'etag': '\"3a70bc956a304ba81d853dafe1de5d9e\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 757, 'rfilename': 'dummy/determiner_noun_agreement_with_adj_irregular_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/determiner_noun_agreement_with_adjective_1/0.1.0/dummy_data.zip', 'etag': '\"a88a63f40066d2ae8f06528cc845f5d2\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 749, 'rfilename': 'dummy/determiner_noun_agreement_with_adjective_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/distractor_agreement_relational_noun/0.1.0/dummy_data.zip', 'etag': '\"9860891071b0473b79194f6e6565d4d5\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 737, 'rfilename': 'dummy/distractor_agreement_relational_noun/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/distractor_agreement_relative_clause/0.1.0/dummy_data.zip', 'etag': '\"bd95732a4096a1d44be5510e96e289e6\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 737, 'rfilename': 'dummy/distractor_agreement_relative_clause/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/drop_argument/0.1.0/dummy_data.zip', 'etag': '\"4246cbc981d1cbf5d0328416a52b8147\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 691, 'rfilename': 'dummy/drop_argument/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/ellipsis_n_bar_1/0.1.0/dummy_data.zip', 'etag': '\"6cbd75e3bb2c2d49a4fbe51471706a8f\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 697, 'rfilename': 'dummy/ellipsis_n_bar_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/ellipsis_n_bar_2/0.1.0/dummy_data.zip', 'etag': '\"10c1c36cda5f08686a2348c7d82f6306\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 697, 
'rfilename': 'dummy/ellipsis_n_bar_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/existential_there_object_raising/0.1.0/dummy_data.zip', 'etag': '\"24793f8775ae82d66df3608f5b72d72f\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 729, 'rfilename': 'dummy/existential_there_object_raising/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/existential_there_quantifiers_1/0.1.0/dummy_data.zip', 'etag': '\"aa24879f651e050eacda5c25ffafa387\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 727, 'rfilename': 'dummy/existential_there_quantifiers_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/existential_there_quantifiers_2/0.1.0/dummy_data.zip', 'etag': '\"0705585e50d8f263f25867504c09c813\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 727, 'rfilename': 'dummy/existential_there_quantifiers_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/existential_there_subject_raising/0.1.0/dummy_data.zip', 'etag': '\"fda3b97d380f5176f731ffda924ba9de\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 731, 'rfilename': 'dummy/existential_there_subject_raising/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/expletive_it_object_raising/0.1.0/dummy_data.zip', 'etag': '\"2106674a6f17e4908c152fee32016a05\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 719, 'rfilename': 'dummy/expletive_it_object_raising/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/inchoative/0.1.0/dummy_data.zip', 'etag': '\"24aba4b2aab4b1934d018d65e66fb47b\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 685, 'rfilename': 'dummy/inchoative/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/intransitive/0.1.0/dummy_data.zip', 'etag': '\"21f3f19e72493b429e30c159f5e30a51\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 689, 'rfilename': 'dummy/intransitive/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/irregular_past_participle_adjectives/0.1.0/dummy_data.zip', 'etag': 
'\"c586a9a3b493cc3f0e68a24bb666857f\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 737, 'rfilename': 'dummy/irregular_past_participle_adjectives/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/irregular_past_participle_verbs/0.1.0/dummy_data.zip', 'etag': '\"f81d92fc913257f1957b97b51b80ad59\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 727, 'rfilename': 'dummy/irregular_past_participle_verbs/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/irregular_plural_subject_verb_agreement_1/0.1.0/dummy_data.zip', 'etag': '\"960ff5ce201f4f3f40b90a939913b540\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 747, 'rfilename': 'dummy/irregular_plural_subject_verb_agreement_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/irregular_plural_subject_verb_agreement_2/0.1.0/dummy_data.zip', 'etag': '\"9ba898073d2a5505c3b8556f4b55f8fb\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 747, 'rfilename': 'dummy/irregular_plural_subject_verb_agreement_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/left_branch_island_echo_question/0.1.0/dummy_data.zip', 'etag': '\"a5bb57f0ea9714fbabba9224d33c4e56\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 729, 'rfilename': 'dummy/left_branch_island_echo_question/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/left_branch_island_simple_question/0.1.0/dummy_data.zip', 'etag': '\"be10d837c20742074d65597ccd374340\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 733, 'rfilename': 'dummy/left_branch_island_simple_question/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/matrix_question_npi_licensor_present/0.1.0/dummy_data.zip', 'etag': '\"300e1321aa464419c048c360591e0c1e\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 737, 'rfilename': 'dummy/matrix_question_npi_licensor_present/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/npi_present_1/0.1.0/dummy_data.zip', 'etag': '\"7447d7643c939ac3c4bdf8c2b1e0784a\"', 'lastModified': 
'2020-05-14T14:57:19.000Z', 'size': 691, 'rfilename': 'dummy/npi_present_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/npi_present_2/0.1.0/dummy_data.zip', 'etag': '\"a991d676c4d73aefa976390d5d7a6941\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 691, 'rfilename': 'dummy/npi_present_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/only_npi_licensor_present/0.1.0/dummy_data.zip', 'etag': '\"7c53926215e0bc66c7094d1f06c315e2\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 715, 'rfilename': 'dummy/only_npi_licensor_present/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/only_npi_scope/0.1.0/dummy_data.zip', 'etag': '\"613051e1d03e43f4b9ec943050ae5cc4\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 693, 'rfilename': 'dummy/only_npi_scope/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/passive_1/0.1.0/dummy_data.zip', 'etag': '\"caba86741ee749d10da8c6a71343cd0e\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 683, 'rfilename': 'dummy/passive_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/passive_2/0.1.0/dummy_data.zip', 'etag': '\"ce805ec1ecd2e1aaa02dfe7a35c08b13\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 683, 'rfilename': 'dummy/passive_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/principle_A_c_command/0.1.0/dummy_data.zip', 'etag': '\"a221171ea87e33f5b6c0f846a79e2dd0\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 707, 'rfilename': 'dummy/principle_A_c_command/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/principle_A_case_1/0.1.0/dummy_data.zip', 'etag': '\"3e83c3688b974786fd9d8d5209a0476b\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 701, 'rfilename': 'dummy/principle_A_case_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/principle_A_case_2/0.1.0/dummy_data.zip', 'etag': '\"4dea33e17e8ef8f8c0d872757c13b3d6\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 701, 'rfilename': 
'dummy/principle_A_case_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/principle_A_domain_1/0.1.0/dummy_data.zip', 'etag': '\"7900b1564e4c529efb41d82be6809a57\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 705, 'rfilename': 'dummy/principle_A_domain_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/principle_A_domain_2/0.1.0/dummy_data.zip', 'etag': '\"23ae73bd517a5bf0f6f2f0f59be52789\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 705, 'rfilename': 'dummy/principle_A_domain_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/principle_A_domain_3/0.1.0/dummy_data.zip', 'etag': '\"ee728b9ae19c9948e2a71583a6b13999\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 705, 'rfilename': 'dummy/principle_A_domain_3/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/principle_A_reconstruction/0.1.0/dummy_data.zip', 'etag': '\"87e77eb32d448ecf248d790233972129\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 717, 'rfilename': 'dummy/principle_A_reconstruction/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/regular_plural_subject_verb_agreement_1/0.1.0/dummy_data.zip', 'etag': '\"89e82be1dcccf4356842072a4f32bf5c\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 743, 'rfilename': 'dummy/regular_plural_subject_verb_agreement_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/regular_plural_subject_verb_agreement_2/0.1.0/dummy_data.zip', 'etag': '\"2c9e5b90e6f63af758fa496463cd9fb4\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 743, 'rfilename': 'dummy/regular_plural_subject_verb_agreement_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/sentential_negation_npi_licensor_present/0.1.0/dummy_data.zip', 'etag': '\"1a1cf51238c21622ffc5fc49aa4b1a92\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 745, 'rfilename': 'dummy/sentential_negation_npi_licensor_present/0.1.0/dummy_data.zip'}, {'key': 
'nlp/datasets/blimp/dummy/sentential_negation_npi_scope/0.1.0/dummy_data.zip', 'etag': '\"657e58f6efdc4fe8044b3a98d101f513\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 723, 'rfilename': 'dummy/sentential_negation_npi_scope/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/sentential_subject_island/0.1.0/dummy_data.zip', 'etag': '\"2025fd4b889dff3071d901d769ae2dfc\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 715, 'rfilename': 'dummy/sentential_subject_island/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/superlative_quantifiers_1/0.1.0/dummy_data.zip', 'etag': '\"3e88f881e2c5b140a637cf77c5aac68e\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 715, 'rfilename': 'dummy/superlative_quantifiers_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/superlative_quantifiers_2/0.1.0/dummy_data.zip', 'etag': '\"9d94cb3368370bddeff5173d4631d72f\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 715, 'rfilename': 'dummy/superlative_quantifiers_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/tough_vs_raising_1/0.1.0/dummy_data.zip', 'etag': '\"e69cdfb4f2e71152048dca408916f89e\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 701, 'rfilename': 'dummy/tough_vs_raising_1/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/tough_vs_raising_2/0.1.0/dummy_data.zip', 'etag': '\"581dc9ecd1c28099badda01a8a31d976\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 701, 'rfilename': 'dummy/tough_vs_raising_2/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/transitive/0.1.0/dummy_data.zip', 'etag': '\"93e8e5ab817c9169f13bada583fba7d5\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 685, 'rfilename': 'dummy/transitive/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/wh_island/0.1.0/dummy_data.zip', 'etag': '\"985dbc9e4984c1d17e6a274eea718cb6\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 683, 'rfilename': 'dummy/wh_island/0.1.0/dummy_data.zip'}, {'key': 
'nlp/datasets/blimp/dummy/wh_questions_object_gap/0.1.0/dummy_data.zip', 'etag': '\"7604d63509f82b6447f060210d5a33d3\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 711, 'rfilename': 'dummy/wh_questions_object_gap/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/wh_questions_subject_gap/0.1.0/dummy_data.zip', 'etag': '\"188e6f8d00454d16258e82664cfa1d19\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 713, 'rfilename': 'dummy/wh_questions_subject_gap/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/wh_questions_subject_gap_long_distance/0.1.0/dummy_data.zip', 'etag': '\"9882a65b7e9b43c8ee27d95f6cc7e7d8\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 741, 'rfilename': 'dummy/wh_questions_subject_gap_long_distance/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/wh_vs_that_no_gap/0.1.0/dummy_data.zip', 'etag': '\"b18933a136b6f1505a0cf2c1bf015d1a\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 699, 'rfilename': 'dummy/wh_vs_that_no_gap/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/wh_vs_that_no_gap_long_distance/0.1.0/dummy_data.zip', 'etag': '\"56f297a9bef3726085ad06039f9b676e\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 727, 'rfilename': 'dummy/wh_vs_that_no_gap_long_distance/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/wh_vs_that_with_gap/0.1.0/dummy_data.zip', 'etag': '\"ecaf2faccae7968d073c717f32904436\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 703, 'rfilename': 'dummy/wh_vs_that_with_gap/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/dummy/wh_vs_that_with_gap_long_distance/0.1.0/dummy_data.zip', 'etag': '\"553219792801242448c22bfe4a5c5566\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 731, 'rfilename': 'dummy/wh_vs_that_with_gap_long_distance/0.1.0/dummy_data.zip'}, {'key': 'nlp/datasets/blimp/urls_checksums/checksums.txt', 'etag': '\"eed3a912a5a68248de27d7fa1c540c8e\"', 'lastModified': '2020-05-14T14:57:19.000Z', 'size': 11410, 'rfilename': 
'urls_checksums/checksums.txt'}]\n", "👉 author: None\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "9uqSkkSovSUt", "colab_type": "text" }, "source": [ "## An example with SQuAD" ] }, { "cell_type": "code", "metadata": { "id": "aOXl6afcvSUu", "colab_type": "code", "outputId": "b8963101-14c7-42c6-e4dc-f14ba68e36d7", "colab": { "base_uri": "https://localhost:8080/", "height": 964, "referenced_widgets": [ "3b0e2e80c42c4a27991f2fa3c427e861", "2d6757ef18e448e9af7fb4d874ff6f9d", "ed4ec751bf7d4d0d9643634c7adb9d56", "d2c54ae694a3453d8f62dd8d07d4b356", "04ba44b50cab4929940e8044d1fb350b", "476623e4ec3845d6b2671f42bec9c7a1", "ec220965dd954f89a66a00a5d18c7f80", "beb50276ed0642658057d665027d1009", "7f56212ab0e44b21bd09af4bce4fd64e", "1dfb3c7bb76d49e49fcc851f06bb18d2", "2a41b6c083924c6f888e5b61b0fffdb8", "f8b2492e22fa43e3aef3c7590ee851ef", "8cefea541e2b4261a959eb11adc4e834", "fcd5574dcbf64b8fb1ad6a3ce663fd3f", "8d4b0c3b71404f7ebb0e9267888dde40", "90ee191829f64774856a9323d0ae9e56", "899eba20bfe642d6819e70850a274f50", "ef9113f13e9b462fbdf8aae0fff7a9f7", "ecc67a44f8154fb4a89b6325ee3462fb", "c6c40c130e7a4e7f9c69a2a1248f2e57", "634ea5af72744cf69b152bc8149a6cea", "366b47f4ab784e0c85d9a644b548ecb0", "0322a2345daa494585e3d37d22c91b40", "94c0c72cf3ec43b6b3160607eb957a3d", "9a633bb0bfc3488e8904224af81ae165", "c275e39a9e724417a0f21c767aad4222", "9f343d5380c9419c8dd831f251a1fc81", "c29ec644dbc444a793b92fbfa6d3e4bc", "49151a31158e4d71a9b4ad1b9b9fe1c8", "2a949fb7059b48e7ac76cae962c50833", "e1483680a79748aa9de08569ba044038", "cd3fede4d6cf496c95d108c7f6ed039b", "7c6b80079af245a68bccd2c9aec20f00", "1fae38301f3b41bbbf4db13d2fe6032b", "148572d3ac064d11b0425afada62ce70", "f03399e96fb7491ab83f709c33c48622", "16b696fec4ab44e9aa22687e403d9eed", "5033d15d013b41bfb2988aebecaa0f2e", "391ed86689b04856a458c929993afb52", "095b84a1c63149a0950d49310d5ccdd3", "449cddc30e5a4755b9b28e030d86b78f", "dc05828e9d114ec2a293a0e7cee8b63f", "9808e758d3e447bcbc388b85441538d3", 
"9ecc410b1fdd4c2c89a34721c52dc464", "c3fa7c31d80946dcae7d6e8b5564c02f", "feba5a8a7b32462a8c4a585c62e12785", "90c0b0457e194b75ae8adf1e13936b27", "2f8cb0bb9f2a42b08cef846c5d3c8a52" ] } }, "source": [ "# Downloading and loading a dataset\n", "\n", "dataset = nlp.load_dataset('squad', split='validation[:10%]')" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:filelock:Lock 139884110310704 acquired on /root/.cache/huggingface/datasets/09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py.lock\n", "INFO:nlp.utils.file_utils:https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/squad.py not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/tmpd52q9bes\n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3b0e2e80c42c4a27991f2fa3c427e861", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4997.0, style=ProgressStyle(description…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "INFO:nlp.utils.file_utils:storing https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/squad.py in cache at /root/.cache/huggingface/datasets/09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py\n", "INFO:nlp.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py\n", "INFO:filelock:Lock 139884110310704 released on /root/.cache/huggingface/datasets/09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py.lock\n", "INFO:filelock:Lock 
139886448054000 acquired on /root/.cache/huggingface/datasets/9ba53336b6bc977097b39b8527b06ec6ba3f60a44230f2a0a918735fcd8ad902.893fb39fe374e4c574667dd71a3017b7e2e1d196f3a34fb00b56bac805447f7c.lock\n", "INFO:nlp.utils.file_utils:https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/dataset_infos.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/tmp9kaastvw\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7f56212ab0e44b21bd09af4bce4fd64e", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2240.0, style=ProgressStyle(description…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "INFO:nlp.utils.file_utils:storing https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/dataset_infos.json in cache at /root/.cache/huggingface/datasets/9ba53336b6bc977097b39b8527b06ec6ba3f60a44230f2a0a918735fcd8ad902.893fb39fe374e4c574667dd71a3017b7e2e1d196f3a34fb00b56bac805447f7c\n", "INFO:nlp.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/9ba53336b6bc977097b39b8527b06ec6ba3f60a44230f2a0a918735fcd8ad902.893fb39fe374e4c574667dd71a3017b7e2e1d196f3a34fb00b56bac805447f7c\n", "INFO:filelock:Lock 139886448054000 released on /root/.cache/huggingface/datasets/9ba53336b6bc977097b39b8527b06ec6ba3f60a44230f2a0a918735fcd8ad902.893fb39fe374e4c574667dd71a3017b7e2e1d196f3a34fb00b56bac805447f7c.lock\n", "INFO:nlp.load:Checking /root/.cache/huggingface/datasets/09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py for additional imports.\n", "INFO:filelock:Lock 139886448054000 acquired on 
/root/.cache/huggingface/datasets/09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py.lock\n", "INFO:nlp.load:Found main folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/squad.py at /usr/local/lib/python3.6/dist-packages/nlp/datasets/squad\n", "INFO:nlp.load:Found specific version folder for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/squad.py at /usr/local/lib/python3.6/dist-packages/nlp/datasets/squad/c0327553d80335e3a3283527f64d9778df7ad04ab28f38148d072782712bb670\n", "INFO:nlp.load:Found script file from https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/squad.py to /usr/local/lib/python3.6/dist-packages/nlp/datasets/squad/c0327553d80335e3a3283527f64d9778df7ad04ab28f38148d072782712bb670/squad.py\n", "INFO:nlp.load:Copying dataset infos file from https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/dataset_infos.json to /usr/local/lib/python3.6/dist-packages/nlp/datasets/squad/c0327553d80335e3a3283527f64d9778df7ad04ab28f38148d072782712bb670/dataset_infos.json\n", "INFO:nlp.load:Creating metadata file for dataset https://s3.amazonaws.com/datasets.huggingface.co/nlp/datasets/squad/squad.py at /usr/local/lib/python3.6/dist-packages/nlp/datasets/squad/c0327553d80335e3a3283527f64d9778df7ad04ab28f38148d072782712bb670/squad.json\n", "INFO:filelock:Lock 139886448054000 released on /root/.cache/huggingface/datasets/09ec6948d9db29db9a2dcd08df97ac45bccfa6aa104ea62d73c97fa4aaa5cd6c.f373b0de1570ca81b50bb03bd371604f7979e35de2cfcf2a3b4521d0b3104d9b.py.lock\n", "INFO:nlp.builder:No config specified, defaulting to first: squad/plain_text\n", "INFO:nlp.info:Loading Dataset Infos from /usr/local/lib/python3.6/dist-packages/nlp/datasets/squad/c0327553d80335e3a3283527f64d9778df7ad04ab28f38148d072782712bb670\n", "INFO:nlp.builder:Generating dataset squad 
(/root/.cache/huggingface/datasets/squad/plain_text/1.0.0)\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "INFO:nlp.builder:Dataset not on Hf google storage. Downloading and preparing it from source\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.75 MiB, total: 119.27 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0...\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "INFO:filelock:Lock 139884104848496 acquired on /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.961f90ccac96b3e5df3c9ebb533f58da8f3ae596f5418c74cc814af15b348739.lock\n", "INFO:nlp.utils.file_utils:https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmp74l2ywcp\n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "899eba20bfe642d6819e70850a274f50", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=8116577.0, style=ProgressStyle(descript…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "INFO:nlp.utils.file_utils:storing https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json in cache at /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.961f90ccac96b3e5df3c9ebb533f58da8f3ae596f5418c74cc814af15b348739\n", "INFO:nlp.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.961f90ccac96b3e5df3c9ebb533f58da8f3ae596f5418c74cc814af15b348739\n", "INFO:filelock:Lock 139884104848496 
released on /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.961f90ccac96b3e5df3c9ebb533f58da8f3ae596f5418c74cc814af15b348739.lock\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "INFO:filelock:Lock 139884104848328 acquired on /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.01470d0bbaa4753fc1435055451f474b824c23e0dc139470f39b1f233bde8747.lock\n", "INFO:nlp.utils.file_utils:https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpfhou_5to\n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9a633bb0bfc3488e8904224af81ae165", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1054280.0, style=ProgressStyle(descript…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "INFO:nlp.utils.file_utils:storing https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json in cache at /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.01470d0bbaa4753fc1435055451f474b824c23e0dc139470f39b1f233bde8747\n", "INFO:nlp.utils.file_utils:creating metadata file for /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.01470d0bbaa4753fc1435055451f474b824c23e0dc139470f39b1f233bde8747\n", "INFO:filelock:Lock 139884104848328 released on /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.01470d0bbaa4753fc1435055451f474b824c23e0dc139470f39b1f233bde8747.lock\n", "INFO:nlp.utils.info_utils:All the checksums matched successfully.\n", 
"INFO:nlp.builder:Generating split train\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7c6b80079af245a68bccd2c9aec20f00", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "INFO:root:generating examples from = /root/.cache/huggingface/datasets/downloads/b8bb19735e1bb591510a01cc032f4c9f969bc0eeb081ae1b328cd306f3b24008.961f90ccac96b3e5df3c9ebb533f58da8f3ae596f5418c74cc814af15b348739\n", "INFO:nlp.arrow_writer:Done writing 87599 examples in 79317110 bytes /root/.cache/huggingface/datasets/squad/plain_text/1.0.0.incomplete/squad-train.arrow.\n", "INFO:nlp.builder:Generating split validation\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "\r" ], "name": "stdout" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "449cddc30e5a4755b9b28e030d86b78f", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "INFO:root:generating examples from = /root/.cache/huggingface/datasets/downloads/9d5462987ef5f814fe15a369c1724f6ec39a2018b3b6271a9d7d2598686ca2ff.01470d0bbaa4753fc1435055451f474b824c23e0dc139470f39b1f233bde8747\n", "INFO:nlp.arrow_writer:Done writing 10570 examples in 10472653 bytes /root/.cache/huggingface/datasets/squad/plain_text/1.0.0.incomplete/squad-validation.arrow.\n", "INFO:nlp.utils.info_utils:All the splits matched successfully.\n", "INFO:nlp.builder:Constructing Dataset for split validation[:10%], from /root/.cache/huggingface/datasets/squad/plain_text/1.0.0\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ 
"\rDataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0. Subsequent calls will reuse this data.\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "rQ0G-eK3vSUw", "colab_type": "text" }, "source": [ "This call to `nlp.load_dataset()` does the following steps under the hood:\n", "\n", "1. Download and import into the library the **SQuAD python processing script** from the HuggingFace AWS bucket if it's not already stored in the library. You can find the SQuAD processing script [here](https://github.com/huggingface/nlp/tree/master/datasets/squad/squad.py) for instance.\n", "\n", " Processing scripts are small python scripts which define the info (citation, description) and format of the dataset and contain the URL to the original SQuAD JSON files and the code to load examples from the original SQuAD JSON files.\n", "\n", "\n", "2. Run the SQuAD python processing script which will:\n", " - **Download the SQuAD dataset** from the original URL (see the script) if it's not already downloaded and cached.\n", " - **Process and cache** all SQuAD in a structured Arrow table for each standard split stored on the drive.\n", "\n", " Arrow tables are arbitrarily long tables, typed with types that can be mapped to numpy/pandas/python standard types and can store nested objects. They can be directly accessed from drive, loaded in RAM or even streamed over the web.\n", " \n", "\n", "3. Return a **dataset built from the splits** asked by the user (default: all); in the above example we create a dataset with the first 10% of the validation split."
] }, { "cell_type": "code", "metadata": { "id": "fercoFwLvSUx", "colab_type": "code", "outputId": "e1d020a3-140c-4e0b-98af-c815d544be0e", "colab": { "base_uri": "https://localhost:8080/", "height": 479 } }, "source": [ "# Informations on the dataset (description, citation, size, splits, format...)\n", "# are provided in `dataset.info` (as a python dataclass)\n", "for key, value in asdict(dataset.info).items():\n", " print('👉 ' + key + ': ' + str(value))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "👉 description: Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n", "\n", "👉 citation: @article{2016arXiv160605250R,\n", " author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev},\n", " Konstantin and {Liang}, Percy},\n", " title = \"{SQuAD: 100,000+ Questions for Machine Comprehension of Text}\",\n", " journal = {arXiv e-prints},\n", " year = 2016,\n", " eid = {arXiv:1606.05250},\n", " pages = {arXiv:1606.05250},\n", "archivePrefix = {arXiv},\n", " eprint = {1606.05250},\n", "}\n", "\n", "👉 homepage: https://rajpurkar.github.io/SQuAD-explorer/\n", "👉 license: \n", "👉 features: {'id': {'dtype': 'string', 'id': None, '_type': 'Value'}, 'title': {'dtype': 'string', 'id': None, '_type': 'Value'}, 'context': {'dtype': 'string', 'id': None, '_type': 'Value'}, 'question': {'dtype': 'string', 'id': None, '_type': 'Value'}, 'answers': {'feature': {'text': {'dtype': 'string', 'id': None, '_type': 'Value'}, 'answer_start': {'dtype': 'int32', 'id': None, '_type': 'Value'}}, 'length': -1, 'id': None, '_type': 'Sequence'}}\n", "👉 supervised_keys: None\n", "👉 builder_name: squad\n", "👉 config_name: plain_text\n", "👉 version: {'version_str': '1.0.0', 'description': 'New split API 
(https://tensorflow.org/datasets/splits)', 'nlp_version_to_prepare': None, 'major': 1, 'minor': 0, 'patch': 0}\n", "👉 splits: {'train': {'name': 'train', 'num_bytes': 79317110, 'num_examples': 87599, 'dataset_name': 'squad'}, 'validation': {'name': 'validation', 'num_bytes': 10472653, 'num_examples': 10570, 'dataset_name': 'squad'}}\n", "👉 download_checksums: {'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json': {'num_bytes': 30288272, 'checksum': '3527663986b8295af4f7fcdff1ba1ff3f72d07d61a20f487cb238a6ef92fd955'}, 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json': {'num_bytes': 4854279, 'checksum': '95aa6a52d5d6a735563366753ca50492a658031da74f301ac5238b03966972c9'}}\n", "👉 download_size: 35142551\n", "👉 dataset_size: 89789763\n", "👉 size_in_bytes: 124932314\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "GE0E87zsvSUz", "colab_type": "text" }, "source": [ "## Inspecting and using the dataset: elements, slices and columns" ] }, { "cell_type": "markdown", "metadata": { "id": "DKf4YFnevSU0", "colab_type": "text" }, "source": [ "The returned `Dataset` object is a memory mapped dataset that behaves similarly to a normal map-style dataset. It is backed by an Apache Arrow table which allows many interesting features." ] }, { "cell_type": "code", "metadata": { "id": "tP1xPqSyvSU0", "colab_type": "code", "outputId": "1b337b0b-3039-4c36-c938-595cfc966473", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "print(dataset)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Dataset(schema: {'id': 'string', 'title': 'string', 'context': 'string', 'question': 'string', 'answers': 'struct, answer_start: list>'}, num_rows: 1057)\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "aiO3rC8yvSU2", "colab_type": "text" }, "source": [ "You can query its length and get items or slices like you would do normally with a python mapping."
] }, { "cell_type": "code", "metadata": { "id": "xxLcdj2yvSU3", "colab_type": "code", "outputId": "88be3642-c132-492c-a001-f3b479c50226", "colab": { "base_uri": "https://localhost:8080/", "height": 374 } }, "source": [ "from pprint import pprint\n", "\n", "print(f\"👉Dataset len(dataset): {len(dataset)}\")\n", "print(\"\\n👉First item 'dataset[0]':\")\n", "pprint(dataset[0])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "👉Dataset len(dataset): 1057\n", "\n", "👉First item 'dataset[0]':\n", "{'answers': {'answer_start': [177, 177, 177],\n", " 'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']},\n", " 'context': 'Super Bowl 50 was an American football game to determine the '\n", " 'champion of the National Football League (NFL) for the 2015 '\n", " 'season. The American Football Conference (AFC) champion Denver '\n", " 'Broncos defeated the National Football Conference (NFC) champion '\n", " 'Carolina Panthers 24–10 to earn their third Super Bowl title. The '\n", " \"game was played on February 7, 2016, at Levi's Stadium in the San \"\n", " 'Francisco Bay Area at Santa Clara, California. 
As this was the '\n", " '50th Super Bowl, the league emphasized the \"golden anniversary\" '\n", " 'with various gold-themed initiatives, as well as temporarily '\n", " 'suspending the tradition of naming each Super Bowl game with '\n", " 'Roman numerals (under which the game would have been known as '\n", " '\"Super Bowl L\"), so that the logo could prominently feature the '\n", " 'Arabic numerals 50.',\n", " 'id': '56be4db0acb8001400a502ec',\n", " 'question': 'Which NFL team represented the AFC at Super Bowl 50?',\n", " 'title': 'Super_Bowl_50'}\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "zk1WQ_cczP5w", "colab_type": "code", "outputId": "5871e9f5-2cbd-4162-c262-3b0ca5e2be4c", "colab": { "base_uri": "https://localhost:8080/", "height": 748 } }, "source": [ "# Or get slices with several examples:\n", "print(\"\\n👉Slice of the two items 'dataset[10:12]':\")\n", "pprint(dataset[10:12])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "\n", "👉Slice of the two items 'dataset[10:12]':\n", "OrderedDict([('id', ['56bea9923aeaaa14008c91bb', '56beace93aeaaa14008c91df']),\n", " ('title', ['Super_Bowl_50', 'Super_Bowl_50']),\n", " ('context',\n", " ['Super Bowl 50 was an American football game to determine the '\n", " 'champion of the National Football League (NFL) for the 2015 '\n", " 'season. The American Football Conference (AFC) champion Denver '\n", " 'Broncos defeated the National Football Conference (NFC) '\n", " 'champion Carolina Panthers 24–10 to earn their third Super '\n", " \"Bowl title. The game was played on February 7, 2016, at Levi's \"\n", " 'Stadium in the San Francisco Bay Area at Santa Clara, '\n", " 'California. 
As this was the 50th Super Bowl, the league '\n", " 'emphasized the \"golden anniversary\" with various gold-themed '\n", " 'initiatives, as well as temporarily suspending the tradition '\n", " 'of naming each Super Bowl game with Roman numerals (under '\n", " 'which the game would have been known as \"Super Bowl L\"), so '\n", " 'that the logo could prominently feature the Arabic numerals '\n", " '50.',\n", " 'Super Bowl 50 was an American football game to determine the '\n", " 'champion of the National Football League (NFL) for the 2015 '\n", " 'season. The American Football Conference (AFC) champion Denver '\n", " 'Broncos defeated the National Football Conference (NFC) '\n", " 'champion Carolina Panthers 24–10 to earn their third Super '\n", " \"Bowl title. The game was played on February 7, 2016, at Levi's \"\n", " 'Stadium in the San Francisco Bay Area at Santa Clara, '\n", " 'California. As this was the 50th Super Bowl, the league '\n", " 'emphasized the \"golden anniversary\" with various gold-themed '\n", " 'initiatives, as well as temporarily suspending the tradition '\n", " 'of naming each Super Bowl game with Roman numerals (under '\n", " 'which the game would have been known as \"Super Bowl L\"), so '\n", " 'that the logo could prominently feature the Arabic numerals '\n", " '50.']),\n", " ('question',\n", " ['What day was the Super Bowl played on?',\n", " 'Who won Super Bowl 50?']),\n", " ('answers',\n", " [{'answer_start': [334, 334, 334],\n", " 'text': ['February 7, 2016', 'February 7', 'February 7, 2016']},\n", " {'answer_start': [177, 177, 177],\n", " 'text': ['Denver Broncos',\n", " 'Denver Broncos',\n", " 'Denver Broncos']}])])\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "QXj2Qr5KvSU5", "colab_type": "code", "outputId": "89582743-4758-404c-bd84-41b256e3e28c", "colab": { "base_uri": "https://localhost:8080/", "height": 54 } }, "source": [ "# You can get a full column of the dataset by indexing with its name as a 
string:\n", "print(dataset['question'][:10])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "['Which NFL team represented the AFC at Super Bowl 50?', 'Which NFL team represented the NFC at Super Bowl 50?', 'Where did Super Bowl 50 take place?', 'Which NFL team won Super Bowl 50?', 'What color was used to emphasize the 50th anniversary of the Super Bowl?', 'What was the theme of Super Bowl 50?', 'What day was the game played on?', 'What is the AFC short for?', 'What was the theme of Super Bowl 50?', 'What does AFC stand for?']\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "6Au7rqPMvSU7", "colab_type": "text" }, "source": [ "The `__getitem__` method returns a different format depending on the type of query:\n", "\n", "- Items like `dataset[0]` are returned as a dict of elements.\n", "- Slices like `dataset[10:20]` are returned as a dict of lists of elements.\n", "- Columns like `dataset['question']` are returned as a list of elements.\n", "\n", "This may seem surprising at first but in our experiments it's actually a lot easier to use for data processing than returning the same format for each of these views on the dataset."
] }, { "cell_type": "markdown", "metadata": { "id": "6DB_y79cvSU8", "colab_type": "text" }, "source": [ "In particular, you can easily iterate along columns in slices, and also naturally permute consecutive indexings with identical results as shown here by permuting column indexing with elements and slices:" ] }, { "cell_type": "code", "metadata": { "id": "wjGocqArvSU9", "colab_type": "code", "outputId": "701c294a-6ca4-4b30-9472-48e2dd482ed5", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "print(dataset[0]['question'] == dataset['question'][0])\n", "print(dataset[10:20]['context'] == dataset['context'][10:20])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "True\n", "True\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "b1-Kj1xQvSU_", "colab_type": "text" }, "source": [ "### Datasets are internally typed and structured\n", "\n", "The dataset is backed by one (or several) Apache Arrow tables which are typed and allow for fast retrieval and access as well as arbitrary-size memory mapping.\n", "\n", "This means respectively that the format for the dataset is clearly defined and that you can load datasets of arbitrary size without worrying about RAM memory limitation (basically the dataset takes no space in RAM, it's directly read from drive when needed with fast IO access)."
] }, { "cell_type": "code", "metadata": { "id": "rAnp_RyPvSVA", "colab_type": "code", "outputId": "7a5b1d76-08ca-4b65-93e9-2f3c47fdc34b", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# You can inspect the dataset column names and type \n", "print(dataset.column_names)\n", "print(dataset.schema)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "['id', 'title', 'context', 'question', 'answers']\n", "id: string not null\n", "title: string not null\n", "context: string not null\n", "question: string not null\n", "answers: struct, answer_start: list> not null\n", " child 0, text: list\n", " child 0, item: string\n", " child 1, answer_start: list\n", " child 0, item: int32\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "au4v3mOQvSVC", "colab_type": "text" }, "source": [ "### Additional misc properties" ] }, { "cell_type": "code", "metadata": { "id": "efFhDWhlvSVC", "colab_type": "code", "outputId": "1cc484cc-951a-4363-858f-1e5d5fe6c935", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# Datasets also have a bunch of properties you can access\n", "print(\"The number of bytes allocated on the drive is \", dataset.nbytes)\n", "print(\"For comparison, here is the number of bytes allocated in memory which can be\")\n", "print(\"accessed with `nlp.total_allocated_bytes()`: \", nlp.total_allocated_bytes())\n", "print(\"The number of rows\", dataset.num_rows)\n", "print(\"The number of columns\", dataset.num_columns)\n", "print(\"The shape (rows, columns)\", dataset.shape)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "The number of bytes allocated on the drive is 9855914\n", "For comparison, here is the number of bytes allocated in memory which can be\n", "accessed with `nlp.total_allocated_bytes()`: 0\n", "The number of rows 1057\n", "The number of columns 5\n", "The shape (rows, columns) (1057, 5)\n" ], "name": "stdout" } ] }, { "cell_type": 
"markdown", "metadata": { "id": "o2_FBqAQvSVE", "colab_type": "text" }, "source": [ "### Additional misc methods" ] }, { "cell_type": "code", "metadata": { "id": "SznY_XqGvSVF", "colab_type": "code", "outputId": "fd888cf7-ac31-490f-f8f3-4b1a2c02d3ad", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# We can list the unique elements in a column. This is done by the backend (so fast!)\n", "print(f\"dataset.unique('title'): {dataset.unique('title')}\")\n", "\n", "# This will drop the column 'id'\n", "dataset.drop('id') # Remove column 'id'\n", "print(f\"After dataset.drop('id'), remaining columns are {dataset.column_names}\")\n", "\n", "# This will flatten nested columns (in 'answers' in our case)\n", "dataset.flatten()\n", "print(f\"After dataset.flatten(), column names are {dataset.column_names}\")\n", "\n", "# We can also \"dictionnary encode\" a column if many of it's elements are similar\n", "# This will reduce it's size by only storing the distinct elements (e.g. string)\n", "# It only has effect on the internal storage (no difference from a user point of view)\n", "dataset.dictionary_encode_column('title')" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "dataset.unique('title'): ['Super_Bowl_50', 'Warsaw']\n", "After dataset.drop('id'), remaining columns are ['title', 'context', 'question', 'answers']\n", "After dataset.flatten(), column names are ['title', 'context', 'question', 'answers.text', 'answers.answer_start']\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "QdyuKs4VvSVH", "colab_type": "text" }, "source": [ "## Cache\n", "\n", "`nlp` datasets are backed by Apache Arrow cache files which allows:\n", "- to load arbitrary large datasets by using [memory mapping](https://en.wikipedia.org/wiki/Memory-mapped_file) (as long as the datasets can fit on the drive)\n", "- to use a fast backend to process the dataset efficiently\n", "- to do smart caching by storing and reusing the 
results of operations performed on the drive\n", "\n", "Let's dive a bit into these parts now" ] }, { "cell_type": "markdown", "metadata": { "id": "9fUcKwcbvSVH", "colab_type": "text" }, "source": [ "You can check the current cache files backing the dataset with the `.cache_files` property" ] }, { "cell_type": "code", "metadata": { "id": "zu8TgHTYvSVI", "colab_type": "code", "outputId": "25a377eb-b4a6-4bdb-efd1-0f1e4952da81", "colab": { "base_uri": "https://localhost:8080/", "height": 68 } }, "source": [ "dataset.cache_files" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "({'filename': '/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/squad-validation.arrow',\n", " 'skip': 0,\n", " 'take': 1057},)" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "markdown", "metadata": { "id": "LjeICK5GvSVK", "colab_type": "text" }, "source": [ "You can clean up the cache files in the current dataset directory (only keeping the currently used one) with `.cleanup_cache_files()`.\n", "\n", "Be careful that no other process is using some other cache files when running this command."
] }, { "cell_type": "code", "metadata": { "id": "3_WNU3dwvSVL", "colab_type": "code", "outputId": "92679df6-9491-4ec1-bdb6-7dd8d0cab6f2", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "dataset.cleanup_cache_files() # Returns the number of removed cache files" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.arrow_dataset:Listing files in /root/.cache/huggingface/datasets/squad/plain_text/1.0.0\n" ], "name": "stderr" }, { "output_type": "execute_result", "data": { "text/plain": [ "0" ] }, "metadata": { "tags": [] }, "execution_count": 17 } ] }, { "cell_type": "markdown", "metadata": { "id": "1Ox7ppKDvSVN", "colab_type": "text" }, "source": [ "## Modifying the dataset with `dataset.map`\n", "\n", "There is a powerful method `.map()` which is inspired by `tf.data` map method and that you can use to apply a function to each examples, independently or in batch." ] }, { "cell_type": "code", "metadata": { "id": "Yz2-27HevSVN", "colab_type": "code", "outputId": "41902dc5-e99f-485b-e707-9533cc4b1e14", "colab": { "base_uri": "https://localhost:8080/", "height": 71 } }, "source": [ "# `.map()` takes a callable accepting a dict as argument\n", "# (same dict as returned by dataset[i])\n", "# and iterate over the dataset by calling the function with each example.\n", "\n", "# Let's print the length of each `context` string in our subset of the dataset\n", "# (10% of the validation i.e. 
1057 examples)\n", "\n", "dataset.map(lambda example: print(len(example['context']), end=','))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "775,775," ], "name": "stdout" }, { "output_type": "stream", "text": [ "\r0it [00:00, ?it/s]" ], "name": "stderr" }, { "output_type": "stream", "text": [ "775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,775,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,637,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,347,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,394,179,179,179,179,179,179,179,179,179,179,179,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,638,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,326,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,704,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1166,1166,1166,1166,1166,1166,1166,1166,1166,1166,1166,1166,1166,1166,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,2060,929,929,929,929,929,929,929,929,929,929,929,929,929,929,929,929,929,929,929,704,704,704,704,704,704,704,704,704,704,704,704,704,704,353,353,353,353,353,353,353,353,353,353,353,353,353,353,353,464,464,464,464,464,464,464,464,464,464,464,464,464,464,464,464,306,306,306,306,306,306,306,306,306,306,306,306,372,372,372,372,372,372,372,372,372,372,372,372,372,372,372,372,372,496,496,496,496,496,496,496,496,496,496,496,496,496,496,496,260,260,260,260,260,260,260,260,260,874,874,874,874,874,874,874,874,874,874,874,874,874,874,1025,1025,1025,1
025,1025,1025,1025,1025,1025,1025,1025,1025,1025,1025,1025,176,176,176,176,176,176,176,176,176,176,176,176,176,176,176,176,782,782,782,782,782,782,782,782,782,782,782,782,782,782,782,782,536,536,536,536,536,536,536,536,536,666,666,666,666,666,666,666,666,666,666,666,666,666,666,666,666,666,495,495,495,495,495,495,495,495,495,495,495,385,385,385,385,385,385,385,385,385,385,385,385,385,385,385,385,385,385,385,441,441,441,441,441,441,441,441,441,441,441,357,357,357,357,357,357,357,357,357,296,296,296,296,296,296,296,296,296,296,644,644,644,644,644,644,644,644,644,644,644,644,644,644,644,644,644,804," ], "name": "stdout" }, { "output_type": "stream", "text": [ "\r637it [00:00, 6365.64it/s]" ], "name": "stderr" }, { "output_type": "stream", "text": [ "804,804,804,804,804,804,804,804,804,804,397,397,397,397,397,397,397,397,397,397,397,397,397,397,360,360,360,360,360,360,360,973,973,973,973,973,973,973,973,973,973,973,973,973,973,263,263,263,263,263,263,263,263,263,263,263,568,568,568,568,568,568,568,568,568,568,568,264,264,264,264,264,264,264,264,264,264,264,264,264,264,264,892,892,892,892,892,892,892,892,892,892,892,206,206,206,206,206,489,489,489,489,489,489,489,489,489,489,489,489,489,181,181,181,181,181,181,181,181,181,181,181,181,531,531,531,531,531,531,531,531,531,531,531,531,664,664,664,664,664,664,664,664,664,664,664,664,664,664,672,672,672,672,672,672,672,672,672,672,672,672,672,672,858,858,858,858,858,858,858,858,858,858,858,858,634,634,634,634,634,634,634,634,634,634,634,634,634,634,891,891,891,891,891,891,891,891,891,891,891,891,891,488,488,488,488,488,488,488,488,488,488,488,488,942,942,942,942,942,942,942,942,942,942,942,942,942,942,942,1162,1162,1162,1162,1162,1162,1162,1162,1162,1162,1162,1162,1162,1162,1162,1353,1353,1353,1353,1353,1353,1353,1353,1353,1353,1353,1353,1353,1353,522,522,522,522,522,1643,1643,1643,1643,1643,628,628,628,628,628,758,758,758,758,758,883,883,883,883,883,559,559,559,559,559,603,603,603,603,631,631,631,631,631,626,626,626,626,626,5
41,541,541,541,541,795,795,795,795,795,591,591,591,591,591,568,568,568,568,568,536,536,536,536,536,575,575,575,575,575,571,571,571,571,571,641,641,641,641,641,665,665,665,665,665" ], "name": "stdout" }, { "output_type": "stream", "text": [ "\r899it [00:00, 4413.66it/s]" ], "name": "stderr" }, { "output_type": "stream", "text": [ ",1088,1088,1088,1088,1088,1619,1619,1619,1619,1619,939,939,939,939,939,865,865,865,865,865,711,711,711,711,711,831,831,831,831,831,501,501,501,501,501,676,676,676,676,676,854,854,854,854,854,784,784,784,784,784,641,641,641,641,641,544,544,544,544,544,918,918,918,918,918,763,763,763,763,763,906,906,906,906,906,632,632,632,632,632,869,869,869,869,869,1044,1044,1044,1044,1044,760,760,760,760,760,715,715,715,715,715,838,838,838,838,838,881,881,881,881,881,940,940,940,940,940,618,618,618,618,618,1205,1205,1205,534,534,534,534,534,757,757,757,757,757,1239,1239,1239,1239,1239,609,609,609,609,609,798,798,798,798,798,613,613,613,613,613,613," ], "name": "stdout" }, { "output_type": "stream", "text": [ "\r1057it [00:00, 4215.63it/s]" ], "name": "stderr" }, { "output_type": "stream", "text": [ "613,613,613,613," ], "name": "stdout" }, { "output_type": "stream", "text": [ "\n" ], "name": "stderr" }, { "output_type": "execute_result", "data": { "text/plain": [ "Dataset(schema: {'title': 'string', 'context': 'string', 'question': 'string', 'answers.text': 'list', 'answers.answer_start': 'list'}, num_rows: 1057)" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "markdown", "metadata": { "id": "Ta3celHnvSVP", "colab_type": "text" }, "source": [ "This is basically the same as doing\n", "\n", "```python\n", "for example in dataset:\n", " function(example)\n", "```" ] }, { "cell_type": "markdown", "metadata": { "id": "i_Ouw5gDvSVP", "colab_type": "text" }, "source": [ "The above example had no effect on the dataset because the method we supplied to `.map()` didn't return a `dict` or a `abc.Mapping` that could be used to update the 
examples in the dataset.\n", "\n", "In such a case, `.map()` will return the same dataset (`self`).\n", "\n", "Now let's see how we can use a method that actually modifies the dataset." ] }, { "cell_type": "markdown", "metadata": { "id": "cEnCi9DFvSVQ", "colab_type": "text" }, "source": [ "### Modifying the dataset example by example" ] }, { "cell_type": "markdown", "metadata": { "id": "kA37VgZhvSVQ", "colab_type": "text" }, "source": [ "The main interest of `.map()` is to update and modify the content of the table and leverage smart caching and fast backend.\n", "\n", "To use `.map()` to update elements in the table you need to provide a function with the following signature: `function(example: dict) -> dict`." ] }, { "cell_type": "code", "metadata": { "id": "vUr65K-4vSVQ", "colab_type": "code", "outputId": "9eb1516f-21e1-4cd6-d095-dfea304b5ac4", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# Let's add a prefix 'My cute title: ' to each of our titles\n", "\n", "def add_prefix_to_title(example):\n", " example['title'] = 'My cute title: ' + example['title']\n", " return example\n", "\n", "dataset = dataset.map(add_prefix_to_title)\n", "\n", "print(dataset.unique('title'))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-7fc546b401ec7a73d642e3460f4bcaa3.arrow\n", "1057it [00:00, 13900.01it/s]\n", "INFO:nlp.arrow_writer:Done writing 1057 examples in 905032 bytes /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-7fc546b401ec7a73d642e3460f4bcaa3.arrow.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "['My cute title: Super_Bowl_50', 'My cute title: Warsaw']\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "FcZ_amDAvSVS", "colab_type": "text" }, "source": [ "This call to `.map()` computes and returns the updated table. 
It will also store the updated table in a cache file indexed by the current state and the mapped function.\n", "\n", "A subsequent call to `.map()` (even in another python session) will reuse the cached file instead of recomputing the operation.\n", "\n", "You can test this by running the previous cell again: you will see that the results are loaded directly from the cache and not recomputed.\n", "\n", "The updated dataset returned by `.map()` is (again) directly memory mapped from drive and not allocated in RAM." ] }, { "cell_type": "markdown", "metadata": { "id": "Skbf8LUEvSVT", "colab_type": "text" }, "source": [ "The function you provide to `.map()` should accept an input with the format of an item of the dataset: `function(dataset[0])` and return a python dict.\n", "\n", "The columns and type of the outputs can be different from the input dict. In this case the new keys will be added as additional columns in the dataset.\n", "\n", "Basically, each dataset example dict is updated with the dictionary returned by the function like this: `example.update(function(example))`."
] }, { "cell_type": "code", "metadata": { "id": "d5De0CfTvSVT", "colab_type": "code", "outputId": "0ae16b0d-efd3-443c-fd80-2d777bce1f29", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# Since the input example dict is updated with our function output dict,\n", "# we can actually just return the updated 'title' field\n", "dataset = dataset.map(lambda example: {'title': 'My cutest title: ' + example['title']})\n", "\n", "print(dataset.unique('title'))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-e254729a165001477fc910898551132f.arrow\n", "1057it [00:00, 12758.48it/s]\n", "INFO:nlp.arrow_writer:Done writing 1057 examples in 923001 bytes /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-e254729a165001477fc910898551132f.arrow.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "['My cutest title: My cute title: Super_Bowl_50', 'My cutest title: My cute title: Warsaw']\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "Q5vny56-vSVV", "colab_type": "text" }, "source": [ "#### Removing columns\n", "You can also remove columns when running map with the `remove_columns=List[str]` argument." 
] }, { "cell_type": "code", "metadata": { "id": "-sPWnsz-vSVW", "colab_type": "code", "outputId": "6ee9e668-b083-420a-de69-2d4e31d24b2c", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# This will remove the 'title' column while doing the update (after having sent it to the mapped function so you can use it in your function!)\n", "dataset = dataset.map(lambda example: {'new_title': 'Wouhahh: ' + example['title']},\n", " remove_columns=['title'])\n", "\n", "print(dataset.column_names)\n", "print(dataset.unique('new_title'))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-319ffdab1a236b2101739c4b33dc26d8.arrow\n", "1057it [00:00, 12976.87it/s]\n", "INFO:nlp.arrow_writer:Done writing 1057 examples in 932514 bytes /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-319ffdab1a236b2101739c4b33dc26d8.arrow.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "['context', 'question', 'answers.text', 'answers.answer_start', 'new_title']\n", "['Wouhahh: My cutest title: My cute title: Super_Bowl_50', 'Wouhahh: My cutest title: My cute title: Warsaw']\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "G459HzD-vSVY", "colab_type": "text" }, "source": [ "#### Using example indices\n", "With `with_indices=True`, dataset indices (from `0` to `len(dataset) - 1`) will be supplied to the function which must thus have the following signature: `function(example: dict, index: int) -> dict`" ] }, { "cell_type": "code", "metadata": { "id": "_kFL37R2vSVY", "colab_type": "code", "outputId": "9f625775-18c1-485a-dd2b-7f3f9fa6df4a", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# This will add the index in the dataset to the 'question' field\n", "dataset = dataset.map(lambda example, idx: {'question': f'{idx}: ' + example['question']},\n", " with_indices=True)\n", 
"\n", "print('\\n'.join(dataset['question'][:5]))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-0d7046ac832c326979b2f70469eac9fa.arrow\n", "1057it [00:00, 13039.70it/s]\n", "INFO:nlp.arrow_writer:Done writing 1057 examples in 937746 bytes /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-0d7046ac832c326979b2f70469eac9fa.arrow.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "0: Which NFL team represented the AFC at Super Bowl 50?\n", "1: Which NFL team represented the NFC at Super Bowl 50?\n", "2: Where did Super Bowl 50 take place?\n", "3: Which NFL team won Super Bowl 50?\n", "4: What color was used to emphasize the 50th anniversary of the Super Bowl?\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "xckhVEWFvSVb", "colab_type": "text" }, "source": [ "### Modifying the dataset with batched updates" ] }, { "cell_type": "markdown", "metadata": { "id": "dzmicbSnvSVb", "colab_type": "text" }, "source": [ "`.map()` can also work with batches of examples (slices of the dataset).\n", "\n", "This is particularly interesting if you have a function that can handle batches of inputs like the tokenizers of HuggingFace `tokenizers`.\n", "\n", "To work on batched inputs set `batched=True` when calling `.map()` and supply a function with the following signature: `function(examples: Dict[List]) -> Dict[List]` or, if you use indices, `function(examples: Dict[List], indices: List[int]) -> Dict[List]`.\n", "\n", "Basically, your function should accept an input with the format of a slice of the dataset: `function(dataset[:10])`."
] }, { "cell_type": "code", "metadata": { "id": "pxHbgSTL0itj", "colab_type": "code", "outputId": "6ada38ca-af2d-4935-acbc-72993e93b37e", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "!pip install transformers" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Collecting transformers\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl (660kB)\n", "\u001b[K |████████████████████████████████| 665kB 3.5MB/s \n", "\u001b[?25hCollecting sentencepiece\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)\n", "\u001b[K |████████████████████████████████| 1.1MB 17.6MB/s \n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.4)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", "Collecting sacremoses\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n", "\u001b[K |████████████████████████████████| 890kB 25.9MB/s \n", "\u001b[?25hRequirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", "Collecting tokenizers==0.7.0\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl 
(3.8MB)\n", "\u001b[K |████████████████████████████████| 3.8MB 34.4MB/s \n", "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.15.1)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.4.5.1)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.9)\n", "Building wheels for collected packages: sacremoses\n", " Building wheel for sacremoses (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893260 sha256=7a20f1b539ae5c37ce1c58b61c7f0f1d942d292d2fa7f27f45cb064a66621ea5\n", " Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n", "Successfully built sacremoses\n", "Installing collected packages: sentencepiece, sacremoses, tokenizers, transformers\n", "Successfully installed sacremoses-0.0.43 sentencepiece-0.1.91 tokenizers-0.7.0 transformers-2.10.0\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "T7gpEg0yvSVc", "colab_type": "code", "outputId": "4d36e8a9-03c3-40f1-84dc-87285a34aa7a", "colab": { "base_uri": "https://localhost:8080/", "referenced_widgets": [ "c95167de0c674bbd88e95df1271dd9ab", "000f6fb64c164486bdafe2c61860899a", "c7304e5dece143a2a346762b482e4b8d", "1ddcdd9899e344f981d06939882d574a", "6b0f78412d694cbb8d7d9dc465e8661d", "3ae828d2a56c48f79c049c415defd21c", "1afa1c44c06a44e79adab422e14ff59a", "e82d9d1e030f41128b265c738e83771e" ] } }, "source": [ "# Let's import a fast tokenizer that can work on batched inputs\n", "# (the 'Fast' tokenizers in HuggingFace)\n", "from transformers import BertTokenizerFast\n", "\n", "tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:transformers.file_utils:PyTorch version 1.5.0+cu101 available.\n", "INFO:transformers.file_utils:TensorFlow version 2.2.0 available.\n", "INFO:filelock:Lock 139884348804680 acquired on /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1.lock\n", "INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpbrrc_uwe\n" ], "name": "stderr" }, { 
"output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c95167de0c674bbd88e95df1271dd9ab", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt in cache at /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1\n", "INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1\n", "INFO:filelock:Lock 139884348804680 released on /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1.lock\n", "INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /root/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "fAmLTPC9vSVe", "colab_type": "code", "outputId": "9fecf3e7-c00e-4c72-ba27-00d2362fa341", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# Now let's batch tokenize our dataset 'context'\n", "dataset = dataset.map(lambda example: tokenizer.batch_encode_plus(example['context']),\n", " batched=True)\n", "\n", "print(\"dataset[0]\", dataset[0])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ 
"INFO:nlp.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-4c8436e14fee9674f678b8735b43c65e.arrow\n", "100%|██████████| 2/2 [00:00<00:00, 3.54it/s]\n", "INFO:nlp.arrow_writer:Done writing 1057 examples in 4749270 bytes /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-4c8436e14fee9674f678b8735b43c65e.arrow.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "dataset[0] {'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the \"golden anniversary\" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \"Super Bowl L\"), so that the logo could prominently feature the Arabic numerals 50.', 'question': '0: Which NFL team represented the AFC at Super Bowl 50?', 'answers.text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'answers.answer_start': [177, 177, 177], 'new_title': 'Wouhahh: My cutest title: My cute title: Super_Bowl_50', 'input_ids': [101, 3198, 5308, 1851, 1108, 1126, 1237, 1709, 1342, 1106, 4959, 1103, 3628, 1104, 1103, 1305, 2289, 1453, 113, 4279, 114, 1111, 1103, 1410, 1265, 119, 1109, 1237, 2289, 3047, 113, 10402, 114, 3628, 7068, 14722, 2378, 1103, 1305, 2289, 3047, 113, 24743, 114, 3628, 2938, 13598, 1572, 782, 1275, 1106, 7379, 1147, 1503, 3198, 5308, 1641, 119, 1109, 1342, 1108, 1307, 1113, 1428, 128, 117, 1446, 117, 1120, 12388, 112, 188, 3339, 1107, 1103, 1727, 2948, 2410, 3894, 1120, 3364, 
10200, 117, 1756, 119, 1249, 1142, 1108, 1103, 13163, 3198, 5308, 117, 1103, 2074, 13463, 1103, 107, 5404, 5453, 107, 1114, 1672, 2284, 118, 12005, 11751, 117, 1112, 1218, 1112, 7818, 28117, 20080, 16264, 1103, 3904, 1104, 10505, 1296, 3198, 5308, 1342, 1114, 2264, 183, 15447, 16179, 113, 1223, 1134, 1103, 1342, 1156, 1138, 1151, 1227, 1112, 107, 3198, 5308, 149, 107, 114, 117, 1177, 1115, 1103, 7998, 1180, 15199, 2672, 1103, 4944, 183, 15447, 16179, 1851, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "kNaJdKskvSVf", "colab_type": "code", "outputId": "67d19737-a55e-4fd0-f047-06b533455b21", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# we have added additional columns\n", "print(dataset.column_names)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "['context', 'question', 'answers.text', 'answers.answer_start', 'new_title', 'input_ids', 'token_type_ids', 'attention_mask']\n" ], "name": "stdout" } ] 
}, { "cell_type": "code", "metadata": { "id": "m3To8ztMvSVj", "colab_type": "code", "outputId": "74569713-2fd2-4b13-e6dc-e09da60db749", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [
"# A fuller example: the complete preparation of the SQuAD dataset\n",
"# for training a model from Transformers\n",
"def convert_to_features(batch):\n",
"    \"\"\"Encode (context, question) pairs and add answer-span token labels.\n",
"\n",
"    `batch` is a slice of the dataset (a dict of lists). The returned encodings\n",
"    additionally hold the `start_positions` and `end_positions` lists.\n",
"    \"\"\"\n",
"    # Encode every context together with its question;\n",
"    # the offset mappings are kept for evaluation\n",
"    pairs = [(context, question)\n",
"             for context, question in zip(batch['context'], batch['question'])]\n",
"    encodings = tokenizer.batch_encode_plus(\n",
"        pairs, pad_to_max_length=True, return_offsets_mapping=True)\n",
"\n",
"    # Turn the character span of the first answer into token indices\n",
"    start_positions = []\n",
"    end_positions = []\n",
"    answer_spans = zip(batch['answers.text'], batch['answers.answer_start'])\n",
"    for row, (texts, starts) in enumerate(answer_spans):\n",
"        begin_char = starts[0]\n",
"        final_char = begin_char + len(texts[0]) - 1\n",
"        start_positions.append(encodings.char_to_token(row, begin_char))\n",
"        end_positions.append(encodings.char_to_token(row, final_char))\n",
"\n",
"    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})\n",
"    return encodings\n",
"\n",
"dataset = dataset.map(convert_to_features, batched=True)"
], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-3cceeef76f89add124dd3c1c12d2f776.arrow\n", "100%|██████████| 2/2 [00:00<00:00, 2.50it/s]\n", "INFO:nlp.arrow_writer:Done writing 1057 examples in 21643250 bytes /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/cache-3cceeef76f89add124dd3c1c12d2f776.arrow.\n" ], "name": "stderr" } ] }, { "cell_type": "code", "metadata": { "id": "KBnmSa46vSVl", "colab_type": "code", "outputId": "cc6157c2-d6e5-441e-cb52-d13a2086696b", "colab": { "base_uri": "https://localhost:8080/" } }, "source": [ "# Now our dataset comprise the labels for the 
start and end positions\n", "# as well as the offsets for converting tokens back\n", "# into spans of the original string for evaluation\n", "print(\"column_names\", dataset.column_names)\n", "print(\"start_positions\", dataset[:5]['start_positions'])" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "column_names ['context', 'question', 'answers.text', 'answers.answer_start', 'new_title', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions']\n", "start_positions [34, 45, 80, 34, 98]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "NzOXxNzQvSVo", "colab_type": "text" }, "source": [ "## Formatting outputs for numpy/torch/tensorflow\n", "\n", "Now that we have tokenized our inputs, we probably want to use this dataset in a `torch.utils.data.DataLoader` or a `tf.data.Dataset`.\n", "\n", "To be able to do this we need to tweak two things:\n", "\n", "- format the indexing (`__getitem__`) to return numpy/pytorch/tensorflow tensors, instead of python objects, and probably\n", "- format the indexing (`__getitem__`) to return only the subset of the columns that we need for our model inputs.\n", "\n", "  We don't want the columns `id` or `title` as inputs to train our model, but we could still want to keep them in the dataset, for instance for the evaluation of the model.\n", " \n", "This is handled by the `.set_format(type: Union[None, str], columns: Union[None, str, List[str]])` method, where:\n", "\n", "- `type` defines the return type for our dataset `__getitem__` method and is one of `[None, 'numpy', 'pandas', 'torch', 'tensorflow']` (`None` means return python objects), and\n", "- `columns` defines the columns returned by `__getitem__` and takes the name of a column in the dataset or a list of columns to return (`None` means return all columns)." 
] }, { "cell_type": "code", "metadata": { "id": "aU2h_qQDvSVo", "colab_type": "code", "outputId": "19b279c9-c2be-4ddd-be6a-817e353b5d31", "colab": { "base_uri": "https://localhost:8080/", "height": 139 } }, "source": [ "columns_to_return = ['input_ids', 'token_type_ids', 'attention_mask',\n", " 'start_positions', 'end_positions']\n", "\n", "dataset.set_format(type='torch',\n", " columns=columns_to_return)\n", "\n", "# Our dataset indexing output is now ready for being used in a pytorch dataloader\n", "print('\\n'.join([' '.join((n, str(type(t)), str(t.shape))) for n, t in dataset[:10].items()]))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.arrow_dataset:Set __getitem__(key) output type to torch for ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'] columns (when key is int or slice) and don't output other (un-formated) columns.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "input_ids torch.Size([10, 451])\n", "token_type_ids torch.Size([10, 451])\n", "attention_mask torch.Size([10, 451])\n", "start_positions torch.Size([10])\n", "end_positions torch.Size([10])\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "Wj1ukGIuvSVq", "colab_type": "code", "outputId": "5c0be879-b2f3-4cc9-ede2-4a9a2b731f6d", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# Note that the columns are not removed from the dataset, just not returned when calling __getitem__\n", "# Similarly the inner type of the dataset is not changed to torch.Tensor, the conversion and filtering is done on-the-fly when querying the dataset\n", "print(dataset.column_names)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "['context', 'question', 'answers.text', 'answers.answer_start', 'new_title', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions']\n" ], "name": "stdout" } ] }, { 
"cell_type": "code", "metadata": { "id": "pWmmUnlpvSVs", "colab_type": "code", "outputId": "392e2624-488d-4a3e-b94c-932d57f7b9d4", "colab": { "base_uri": "https://localhost:8080/", "height": 221 } }, "source": [ "# We can remove the formating with `.reset_format()`\n", "# or, identically, a call to `.set_format()` with no arguments\n", "dataset.reset_format()\n", "\n", "print('\\n'.join([' '.join((n, str(type(t)))) for n, t in dataset[:10].items()]))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:nlp.arrow_dataset:Set __getitem__(key) output type to python objects for no columns (when key is int or slice) and don't output other (un-formated) columns.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "context \n", "question \n", "answers.text \n", "answers.answer_start \n", "new_title \n", "input_ids \n", "token_type_ids \n", "attention_mask \n", "offset_mapping \n", "start_positions \n", "end_positions \n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "VyUOA07svSVu", "colab_type": "code", "outputId": "c8612c99-dfe5-488d-dd41-07c6f68901a5", "colab": { "base_uri": "https://localhost:8080/", "height": 238 } }, "source": [ "# The current format can be checked with `.format`,\n", "# which is a dict of the type and formating\n", "dataset.format" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'columns': ['context',\n", " 'question',\n", " 'answers.text',\n", " 'answers.answer_start',\n", " 'new_title',\n", " 'input_ids',\n", " 'token_type_ids',\n", " 'attention_mask',\n", " 'offset_mapping',\n", " 'start_positions',\n", " 'end_positions'],\n", " 'output_all_columns': False,\n", " 'type': 'python'}" ] }, "metadata": { "tags": [] }, "execution_count": 32 } ] }, { "cell_type": "markdown", "metadata": { "id": "xyi2eMeSvSVv", "colab_type": "text" }, "source": [ "# Wrapping this all up (PyTorch)\n", "\n", "Let's wrap this all up with the full code to 
load and prepare SQuAD for training a PyTorch model from HuggingFace `transformers` library.\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "l0j8BPLi6Qlv", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 306 }, "outputId": "9103326b-cba4-4fc2-9ee0-5ec10938c540" }, "source": [ "!pip install transformers" ], "execution_count": 13, "outputs": [ { "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (2.10.0)\n", "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.6/dist-packages (from transformers) (0.1.91)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.4)\n", "Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.43)\n", "Requirement already satisfied: tokenizers==0.7.0 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7.0)\n", "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.15.1)\n", 
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.9)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.4.5.1)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "QvExTIZWvSVw", "colab_type": "code", "colab": {} }, "source": [
"import nlp\n",
"import torch\n",
"from transformers import BertTokenizerFast\n",
"\n",
"# Load our training dataset and tokenizer\n",
"dataset = nlp.load_dataset('squad')\n",
"tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')\n",
"\n",
"def get_correct_alignement(context, answer):\n",
"    \"\"\"Return the (start, end) character span of the first gold answer in `context`.\n",
"\n",
"    Some original examples in SQuAD have indices wrong by 1 or 2 characters.\n",
"    The annotated start is tried first, then the start shifted left by one\n",
"    and by two characters. The returned `end` is exclusive.\n",
"\n",
"    Raises:\n",
"        ValueError: if the gold text is found at none of the three offsets.\n",
"    \"\"\"\n",
"    gold_text = answer['text'][0]\n",
"    start_idx = answer['answer_start'][0]\n",
"    for shift in (0, 1, 2):\n",
"        start = start_idx - shift\n",
"        end = start + len(gold_text)\n",
"        # Skip negative starts: a negative slice index would wrap around\n",
"        # to the end of the string instead of failing\n",
"        if start >= 0 and context[start:end] == gold_text:\n",
"            return start, end\n",
"    raise ValueError('Could not align answer {!r} near character {} of the context'.format(gold_text, start_idx))\n",
"\n",
"# Tokenize our training dataset\n",
"def convert_to_features(example_batch):\n",
"    \"\"\"Tokenize a batch of (context, question) pairs and add answer-span labels.\n",
"\n",
"    Adds `start_positions` and `end_positions`, obtained with `char_to_token`\n",
"    from the first and last character of each example's first gold answer.\n",
"    \"\"\"\n",
"    # Tokenize contexts and questions (as pairs of inputs)\n",
"    # NOTE(review): no truncation is requested here, so pairs longer than the\n",
"    # model's maximum length are presumably kept at full length - confirm before training\n",
"    input_pairs = list(zip(example_batch['context'], example_batch['question']))\n",
"    encodings = tokenizer.batch_encode_plus(input_pairs, pad_to_max_length=True)\n",
"\n",
"    # Compute start and end tokens for labels using the alignment methods of Transformers' fast tokenizers\n",
"    start_positions, end_positions = [], []\n",
"    for i, (context, answer) in enumerate(zip(example_batch['context'], example_batch['answers'])):\n",
"        start_idx, end_idx = get_correct_alignement(context, answer)\n",
"        start_positions.append(encodings.char_to_token(i, start_idx))\n",
"        # end_idx is exclusive, so the last answer character sits at end_idx - 1\n",
"        end_positions.append(encodings.char_to_token(i, end_idx - 1))\n",
"    encodings.update({'start_positions': start_positions,\n",
"                      'end_positions': end_positions})\n",
"    return encodings\n",
"\n",
"dataset['train'] = dataset['train'].map(convert_to_features, batched=True)\n",
"\n",
"# Format our dataset to output torch.Tensor objects to train a PyTorch model\n",
"columns = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']\n",
"dataset['train'].set_format(type='torch', columns=columns)\n",
"\n",
"# Instantiate a PyTorch DataLoader around our dataset\n",
"dataloader = torch.utils.data.DataLoader(dataset['train'], batch_size=8)"
], "execution_count": 0, "outputs": [] }, { "cell_type":
"code", "metadata": { "id": "4mHnwMx2vSVx", "colab_type": "code", "outputId": "178ed3de-321c-424d-d55e-2014eb43cc5f", "colab": { "base_uri": "https://localhost:8080/", "height": 866, "referenced_widgets": [ "76f672fa3f5d4ee9a79409043a763938", "57a5736e9d634e74995df300e07d53e4", "fab57dec41a9439b9e822d20785f06c8", "ccb3f04fa88d4e66a8c04d829de998e3", "d7213a66331e4b37af0e8c9a51a96592", "37592a88c2e244a99ae281f4f127465b", "8fbb0ca9e6e7402bbeccdf822cfe5189", "00298c6910ec47da9b05014493f71545", "3333a69504ea4ea98f20ccf31b54a96b", "bc21e8ef90c54a0280ee97431f5404c9", "ff60cb6f5b6d4f928d624b8d7bc96ac3", "77044c08a02441b68c9bc78fd340df7d", "67003b0a383a4456913fdd0930cfcefa", "0c72a266137a4ca199f13236b574f3e1", "ba1efb6f24ac4209b1cae526a85ec4f1", "4e3757c6b2f04013b40cdd4d3a4d127f" ] } }, "source": [
"# Let's load a pretrained Bert model and a simple optimizer\n",
"from transformers import BertForQuestionAnswering\n",
"\n",
"# Use the same 'bert-base-cased' checkpoint as the tokenizer: with the\n",
"# 'distilbert-base-cased' checkpoint the loading log reports the BERT embedding\n",
"# and encoder weights as not initialized from the pretrained model\n",
"model = BertForQuestionAnswering.from_pretrained('bert-base-cased')\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)"
], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "INFO:filelock:Lock 139884094601256 acquired on /root/.cache/torch/transformers/774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494.lock\n", "INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp1uhk_b1k\n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "76f672fa3f5d4ee9a79409043a763938", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ 
"INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json in cache at /root/.cache/torch/transformers/774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494\n", "INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494\n", "INFO:filelock:Lock 139884094601256 released on /root/.cache/torch/transformers/774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494.lock\n", "INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json from cache at /root/.cache/torch/transformers/774d52b0be7c2f621ac9e64708a8b80f22059f6d0e264e1bdc4f4d71c386c4ea.f44aaaab97e2ee0f8d9071a5cd694e19bf664237a92aea20ebe04ddb7097b494\n", "INFO:transformers.configuration_utils:Model config BertConfig {\n", " \"activation\": \"gelu\",\n", " \"attention_dropout\": 0.1,\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"dim\": 768,\n", " \"dropout\": 0.1,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dim\": 3072,\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-12,\n", " \"max_position_embeddings\": 512,\n", " \"model_type\": \"bert\",\n", " \"n_heads\": 12,\n", " \"n_layers\": 6,\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"output_past\": true,\n", " \"pad_token_id\": 0,\n", " \"qa_dropout\": 0.1,\n", " \"seq_classif_dropout\": 0.2,\n", " \"sinusoidal_pos_embds\": false,\n", " \"tie_weights_\": true,\n", " \"type_vocab_size\": 2,\n", " \"vocab_size\": 28996\n", "}\n", "\n" ], "name": "stderr" }, { 
"output_type": "stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "INFO:filelock:Lock 139884094601256 acquired on /root/.cache/torch/transformers/185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d.lock\n", "INFO:transformers.file_utils:https://cdn.huggingface.co/distilbert-base-cased-pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp8t3mu3iu\n" ], "name": "stderr" }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3333a69504ea4ea98f20ccf31b54a96b", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "INFO:transformers.file_utils:storing https://cdn.huggingface.co/distilbert-base-cased-pytorch_model.bin in cache at /root/.cache/torch/transformers/185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d\n", "INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d\n", "INFO:filelock:Lock 139884094601256 released on /root/.cache/torch/transformers/185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d.lock\n", "INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/distilbert-base-cased-pytorch_model.bin from cache at /root/.cache/torch/transformers/185eb053d63bc5c2d6994e4b2a8e5eb59f31af90db9c5fae5e38c32a986462cb.857b7d17ad0bfaa2eec50caf481575bab1073303fef16bd5f29bc5248b2b8c7d\n" ], "name": "stderr" }, { "output_type": 
"stream", "text": [ "\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "INFO:transformers.modeling_utils:Weights of BertForQuestionAnswering not initialized from pretrained model: ['embeddings.word_embeddings.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.LayerNorm.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.layer.1.attention.self.query.bias', 'encoder.layer.1.attention.self.key.weight', 'encoder.layer.1.attention.self.key.bias', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.1.attention.self.value.bias', 'encoder.layer.1.attention.output.dense.weight', 'encoder.layer.1.attention.output.dense.bias', 'encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.1.attention.output.LayerNorm.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.2.attention.self.query.weight', 'encoder.layer.2.attention.self.query.bias', 'encoder.layer.2.attention.self.key.weight', 'encoder.layer.2.attention.self.key.bias', 
'encoder.layer.2.attention.self.value.weight', 'encoder.layer.2.attention.self.value.bias', 'encoder.layer.2.attention.output.dense.weight', 'encoder.layer.2.attention.output.dense.bias', 'encoder.layer.2.attention.output.LayerNorm.weight', 'encoder.layer.2.attention.output.LayerNorm.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.3.attention.self.query.weight', 'encoder.layer.3.attention.self.query.bias', 'encoder.layer.3.attention.self.key.weight', 'encoder.layer.3.attention.self.key.bias', 'encoder.layer.3.attention.self.value.weight', 'encoder.layer.3.attention.self.value.bias', 'encoder.layer.3.attention.output.dense.weight', 'encoder.layer.3.attention.output.dense.bias', 'encoder.layer.3.attention.output.LayerNorm.weight', 'encoder.layer.3.attention.output.LayerNorm.bias', 'encoder.layer.3.intermediate.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.output.dense.weight', 'encoder.layer.3.output.dense.bias', 'encoder.layer.3.output.LayerNorm.weight', 'encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.4.attention.self.query.weight', 'encoder.layer.4.attention.self.query.bias', 'encoder.layer.4.attention.self.key.weight', 'encoder.layer.4.attention.self.key.bias', 'encoder.layer.4.attention.self.value.weight', 'encoder.layer.4.attention.self.value.bias', 'encoder.layer.4.attention.output.dense.weight', 'encoder.layer.4.attention.output.dense.bias', 'encoder.layer.4.attention.output.LayerNorm.weight', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.4.intermediate.dense.weight', 'encoder.layer.4.intermediate.dense.bias', 'encoder.layer.4.output.dense.weight', 'encoder.layer.4.output.dense.bias', 'encoder.layer.4.output.LayerNorm.weight', 'encoder.layer.4.output.LayerNorm.bias', 
'encoder.layer.5.attention.self.query.weight', 'encoder.layer.5.attention.self.query.bias', 'encoder.layer.5.attention.self.key.weight', 'encoder.layer.5.attention.self.key.bias', 'encoder.layer.5.attention.self.value.weight', 'encoder.layer.5.attention.self.value.bias', 'encoder.layer.5.attention.output.dense.weight', 'encoder.layer.5.attention.output.dense.bias', 'encoder.layer.5.attention.output.LayerNorm.weight', 'encoder.layer.5.attention.output.LayerNorm.bias', 'encoder.layer.5.intermediate.dense.weight', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.5.output.dense.weight', 'encoder.layer.5.output.dense.bias', 'encoder.layer.5.output.LayerNorm.weight', 'encoder.layer.5.output.LayerNorm.bias', 'encoder.layer.6.attention.self.query.weight', 'encoder.layer.6.attention.self.query.bias', 'encoder.layer.6.attention.self.key.weight', 'encoder.layer.6.attention.self.key.bias', 'encoder.layer.6.attention.self.value.weight', 'encoder.layer.6.attention.self.value.bias', 'encoder.layer.6.attention.output.dense.weight', 'encoder.layer.6.attention.output.dense.bias', 'encoder.layer.6.attention.output.LayerNorm.weight', 'encoder.layer.6.attention.output.LayerNorm.bias', 'encoder.layer.6.intermediate.dense.weight', 'encoder.layer.6.intermediate.dense.bias', 'encoder.layer.6.output.dense.weight', 'encoder.layer.6.output.dense.bias', 'encoder.layer.6.output.LayerNorm.weight', 'encoder.layer.6.output.LayerNorm.bias', 'encoder.layer.7.attention.self.query.weight', 'encoder.layer.7.attention.self.query.bias', 'encoder.layer.7.attention.self.key.weight', 'encoder.layer.7.attention.self.key.bias', 'encoder.layer.7.attention.self.value.weight', 'encoder.layer.7.attention.self.value.bias', 'encoder.layer.7.attention.output.dense.weight', 'encoder.layer.7.attention.output.dense.bias', 'encoder.layer.7.attention.output.LayerNorm.weight', 'encoder.layer.7.attention.output.LayerNorm.bias', 'encoder.layer.7.intermediate.dense.weight', 'encoder.layer.7.intermediate.dense.bias', 
'encoder.layer.7.output.dense.weight', 'encoder.layer.7.output.dense.bias', 'encoder.layer.7.output.LayerNorm.weight', 'encoder.layer.7.output.LayerNorm.bias', 'encoder.layer.8.attention.self.query.weight', 'encoder.layer.8.attention.self.query.bias', 'encoder.layer.8.attention.self.key.weight', 'encoder.layer.8.attention.self.key.bias', 'encoder.layer.8.attention.self.value.weight', 'encoder.layer.8.attention.self.value.bias', 'encoder.layer.8.attention.output.dense.weight', 'encoder.layer.8.attention.output.dense.bias', 'encoder.layer.8.attention.output.LayerNorm.weight', 'encoder.layer.8.attention.output.LayerNorm.bias', 'encoder.layer.8.intermediate.dense.weight', 'encoder.layer.8.intermediate.dense.bias', 'encoder.layer.8.output.dense.weight', 'encoder.layer.8.output.dense.bias', 'encoder.layer.8.output.LayerNorm.weight', 'encoder.layer.8.output.LayerNorm.bias', 'encoder.layer.9.attention.self.query.weight', 'encoder.layer.9.attention.self.query.bias', 'encoder.layer.9.attention.self.key.weight', 'encoder.layer.9.attention.self.key.bias', 'encoder.layer.9.attention.self.value.weight', 'encoder.layer.9.attention.self.value.bias', 'encoder.layer.9.attention.output.dense.weight', 'encoder.layer.9.attention.output.dense.bias', 'encoder.layer.9.attention.output.LayerNorm.weight', 'encoder.layer.9.attention.output.LayerNorm.bias', 'encoder.layer.9.intermediate.dense.weight', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.9.output.dense.weight', 'encoder.layer.9.output.dense.bias', 'encoder.layer.9.output.LayerNorm.weight', 'encoder.layer.9.output.LayerNorm.bias', 'encoder.layer.10.attention.self.query.weight', 'encoder.layer.10.attention.self.query.bias', 'encoder.layer.10.attention.self.key.weight', 'encoder.layer.10.attention.self.key.bias', 'encoder.layer.10.attention.self.value.weight', 'encoder.layer.10.attention.self.value.bias', 'encoder.layer.10.attention.output.dense.weight', 'encoder.layer.10.attention.output.dense.bias', 
'encoder.layer.10.attention.output.LayerNorm.weight', 'encoder.layer.10.attention.output.LayerNorm.bias', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.output.dense.weight', 'encoder.layer.10.output.dense.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.11.attention.self.query.weight', 'encoder.layer.11.attention.self.query.bias', 'encoder.layer.11.attention.self.key.weight', 'encoder.layer.11.attention.self.key.bias', 'encoder.layer.11.attention.self.value.weight', 'encoder.layer.11.attention.self.value.bias', 'encoder.layer.11.attention.output.dense.weight', 'encoder.layer.11.attention.output.dense.bias', 'encoder.layer.11.attention.output.LayerNorm.weight', 'encoder.layer.11.attention.output.LayerNorm.bias', 'encoder.layer.11.intermediate.dense.weight', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.11.output.dense.weight', 'encoder.layer.11.output.dense.bias', 'encoder.layer.11.output.LayerNorm.weight', 'encoder.layer.11.output.LayerNorm.bias', 'pooler.dense.weight', 'pooler.dense.bias', 'qa_outputs.bias', 'qa_outputs.weight']\n", "INFO:transformers.modeling_utils:Weights from pretrained model not used in BertForQuestionAnswering: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 
'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.0.output_layer_norm.weight', 'distilbert.transformer.layer.0.output_layer_norm.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.q_lin.bias', 'distilbert.transformer.layer.1.attention.k_lin.weight', 'distilbert.transformer.layer.1.attention.k_lin.bias', 'distilbert.transformer.layer.1.attention.v_lin.weight', 'distilbert.transformer.layer.1.attention.v_lin.bias', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.1.attention.out_lin.bias', 'distilbert.transformer.layer.1.sa_layer_norm.weight', 'distilbert.transformer.layer.1.sa_layer_norm.bias', 'distilbert.transformer.layer.1.ffn.lin1.weight', 'distilbert.transformer.layer.1.ffn.lin1.bias', 'distilbert.transformer.layer.1.ffn.lin2.weight', 'distilbert.transformer.layer.1.ffn.lin2.bias', 'distilbert.transformer.layer.1.output_layer_norm.weight', 'distilbert.transformer.layer.1.output_layer_norm.bias', 'distilbert.transformer.layer.2.attention.q_lin.weight', 'distilbert.transformer.layer.2.attention.q_lin.bias', 'distilbert.transformer.layer.2.attention.k_lin.weight', 'distilbert.transformer.layer.2.attention.k_lin.bias', 'distilbert.transformer.layer.2.attention.v_lin.weight', 'distilbert.transformer.layer.2.attention.v_lin.bias', 'distilbert.transformer.layer.2.attention.out_lin.weight', 'distilbert.transformer.layer.2.attention.out_lin.bias', 'distilbert.transformer.layer.2.sa_layer_norm.weight', 'distilbert.transformer.layer.2.sa_layer_norm.bias', 'distilbert.transformer.layer.2.ffn.lin1.weight', 'distilbert.transformer.layer.2.ffn.lin1.bias', 'distilbert.transformer.layer.2.ffn.lin2.weight', 'distilbert.transformer.layer.2.ffn.lin2.bias', 
'distilbert.transformer.layer.2.output_layer_norm.weight', 'distilbert.transformer.layer.2.output_layer_norm.bias', 'distilbert.transformer.layer.3.attention.q_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.transformer.layer.3.attention.k_lin.weight', 'distilbert.transformer.layer.3.attention.k_lin.bias', 'distilbert.transformer.layer.3.attention.v_lin.weight', 'distilbert.transformer.layer.3.attention.v_lin.bias', 'distilbert.transformer.layer.3.attention.out_lin.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.transformer.layer.3.sa_layer_norm.weight', 'distilbert.transformer.layer.3.sa_layer_norm.bias', 'distilbert.transformer.layer.3.ffn.lin1.weight', 'distilbert.transformer.layer.3.ffn.lin1.bias', 'distilbert.transformer.layer.3.ffn.lin2.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.3.output_layer_norm.weight', 'distilbert.transformer.layer.3.output_layer_norm.bias', 'distilbert.transformer.layer.4.attention.q_lin.weight', 'distilbert.transformer.layer.4.attention.q_lin.bias', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.transformer.layer.4.attention.v_lin.weight', 'distilbert.transformer.layer.4.attention.v_lin.bias', 'distilbert.transformer.layer.4.attention.out_lin.weight', 'distilbert.transformer.layer.4.attention.out_lin.bias', 'distilbert.transformer.layer.4.sa_layer_norm.weight', 'distilbert.transformer.layer.4.sa_layer_norm.bias', 'distilbert.transformer.layer.4.ffn.lin1.weight', 'distilbert.transformer.layer.4.ffn.lin1.bias', 'distilbert.transformer.layer.4.ffn.lin2.weight', 'distilbert.transformer.layer.4.ffn.lin2.bias', 'distilbert.transformer.layer.4.output_layer_norm.weight', 'distilbert.transformer.layer.4.output_layer_norm.bias', 'distilbert.transformer.layer.5.attention.q_lin.weight', 'distilbert.transformer.layer.5.attention.q_lin.bias', 
'distilbert.transformer.layer.5.attention.k_lin.weight', 'distilbert.transformer.layer.5.attention.k_lin.bias', 'distilbert.transformer.layer.5.attention.v_lin.weight', 'distilbert.transformer.layer.5.attention.v_lin.bias', 'distilbert.transformer.layer.5.attention.out_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.bias', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.5.sa_layer_norm.bias', 'distilbert.transformer.layer.5.ffn.lin1.weight', 'distilbert.transformer.layer.5.ffn.lin1.bias', 'distilbert.transformer.layer.5.ffn.lin2.weight', 'distilbert.transformer.layer.5.ffn.lin2.bias', 'distilbert.transformer.layer.5.output_layer_norm.weight', 'distilbert.transformer.layer.5.output_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n" ], "name": "stderr" } ] }, { "cell_type": "code", "metadata": { "id": "biqDH9vpvSVz", "colab_type": "code", "outputId": "5587703a-cbab-44ea-bc83-b6f5b322f5e5", "colab": { "base_uri": "https://localhost:8080/", "height": 102 } }, "source": [
 "# Now let's train our model\n",
 "# (the loop below stops after five optimization steps: 0 to 4)\n",
 "\n",
 "model.train()\n",
 "for i, batch in enumerate(dataloader):\n",
 "    outputs = model(**batch)\n",
 "    # The first element of the model outputs is used as the loss\n",
 "    loss = outputs[0]\n",
 "    loss.backward()\n",
 "    optimizer.step()\n",
 "    # Clear the gradients before the next step\n",
 "    model.zero_grad()\n",
 "    print(f'Step {i} - loss: {loss:.3}')\n",
 "    if i >= 4:\n",
 "        break"
 ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Step 0 - loss: 6.42\n", "Step 1 - loss: 5.64\n", "Step 2 - loss: 5.09\n", "Step 3 - loss: 5.59\n", "Step 4 - loss: 4.81\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "kxZQ9Ms_vSV1", "colab_type": "text" }, "source": [ "# Wrapping this all up (Tensorflow)\n", "\n", "Let's wrap this all up with the full code to load and prepare SQuAD for training a Tensorflow model (works only from the version 2.2.0)" ] }, { "cell_type": "code", "metadata": 
{ "id": "ZE8VSTYovSV2", "colab_type": "code", "outputId": "4f3c33f0-deb1-48d4-d778-c3d80172b22f", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [
 "import tensorflow as tf\n",
 "import nlp\n",
 "from transformers import BertTokenizerFast\n",
 "\n",
 "# Load our training dataset and tokenizer\n",
 "train_tf_dataset = nlp.load_dataset('squad', split=\"train\")\n",
 "tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')\n",
 "\n",
 "# Tokenize our training dataset\n",
 "# The only difference here is that start_positions and end_positions\n",
 "# must be single dim list => [[23], [45] ...]\n",
 "# instead of => [23, 45 ...]\n",
 "def convert_to_tf_features(example_batch):\n",
 "    \"\"\"Tokenize a batch of SQuAD examples and attach the answer token positions.\n",
 "\n",
 "    Args:\n",
 "        example_batch: batch of examples providing the 'context', 'question'\n",
 "            and 'answers' columns.\n",
 "\n",
 "    Returns:\n",
 "        The tokenizer encodings, updated with 'start_positions' and\n",
 "        'end_positions' (one single-element list per example).\n",
 "    \"\"\"\n",
 "    # Tokenize contexts and questions (as pairs of inputs)\n",
 "    input_pairs = list(zip(example_batch['context'], example_batch['question']))\n",
 "    encodings = tokenizer.batch_encode_plus(input_pairs, pad_to_max_length=True, max_length=tokenizer.max_len)\n",
 "\n",
 "    # Compute start and end tokens for labels using Transformers' fast tokenizers alignment methods.\n",
 "    start_positions, end_positions = [], []\n",
 "    for i, (context, answer) in enumerate(zip(example_batch['context'], example_batch['answers'])):\n",
 "        start_idx, end_idx = get_correct_alignement(context, answer)\n",
 "        # char_to_token may give None (presumably when the answer is cut off by\n",
 "        # truncation - TODO confirm); such examples are filtered out afterwards.\n",
 "        start_positions.append([encodings.char_to_token(i, start_idx)])\n",
 "        # end_idx presumably points one past the last answer character, hence end_idx-1\n",
 "        end_positions.append([encodings.char_to_token(i, end_idx-1)])\n",
 "\n",
 "    if start_positions and end_positions:\n",
 "        encodings.update({'start_positions': start_positions,\n",
 "                          'end_positions': end_positions})\n",
 "    return encodings\n",
 "\n",
 "train_tf_dataset = train_tf_dataset.map(convert_to_tf_features, batched=True)\n",
 "\n",
 "def remove_none_values(example):\n",
 "    \"\"\"Filter predicate: keep an example only if both answer positions are set.\n",
 "\n",
 "    Both checks must hold (`and`, not `or`): an example with a single missing\n",
 "    position still carries a None label and has to be dropped.\n",
 "    \"\"\"\n",
 "    return None not in example[\"start_positions\"] and None not in example[\"end_positions\"]\n",
 "\n",
 "train_tf_dataset = train_tf_dataset.filter(remove_none_values, load_from_cache_file=False)\n",
 "columns = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']\n",
 "train_tf_dataset.set_format(type='tensorflow', columns=columns)\n",
 "# The first three columns are the model inputs, the last two are the labels\n",
 "features = {x: train_tf_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.max_len]) for x in columns[:3]}\n",
 "# The label keys must match the output names used in model.compile()\n",
 "labels = {\"output_1\": train_tf_dataset[\"start_positions\"].to_tensor(default_value=0, shape=[None, 1])}\n",
 "labels[\"output_2\"] = train_tf_dataset[\"end_positions\"].to_tensor(default_value=0, shape=[None, 1])\n",
 "tfdataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(8)"
 ], "execution_count": 15, "outputs": [ { "output_type": "stream", "text": [ "100%|██████████| 88/88 [00:38<00:00, 2.30it/s]\n", "100%|██████████| 88/88 [00:38<00:00, 2.26it/s]\n" ], "name": "stderr" } ] }, { "cell_type": "code", "metadata": { "id": "y0dfw8K8vSV4", "colab_type": "code", "colab": {} }, "source": [
 "# Let's load a pretrained TF2 Bert model and a simple optimizer\n",
 "from transformers import TFBertForQuestionAnswering\n",
 "\n",
 "model = TFBertForQuestionAnswering.from_pretrained(\"bert-base-cased\")\n",
 "loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE, from_logits=True)\n",
 "opt = tf.keras.optimizers.Adam(learning_rate=3e-5)\n",
 "model.compile(optimizer=opt,\n",
 "              loss={'output_1': loss_fn, 'output_2': loss_fn},\n",
 "              loss_weights={'output_1': 1., 'output_2': 1.},\n",
 "              metrics=['accuracy'])"
 ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "TcYtiykmvSV6", "colab_type": "code", "outputId": "4b755c1b-8e61-43ab-ffc6-d3e82ed4174d", "colab": { "base_uri": "https://localhost:8080/", "height": 207 } }, "source": [
 "# Now let's train our model\n",
 "\n",
 "model.fit(tfdataset, epochs=1, steps_per_epoch=3)"
 ], "execution_count": 17, "outputs": [ { "output_type": "stream", "text": [ "WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_question_answering_1/bert/pooler/dense/kernel:0', 
'tf_bert_for_question_answering_1/bert/pooler/dense/bias:0'] when minimizing the loss.\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_question_answering_1/bert/pooler/dense/kernel:0', 'tf_bert_for_question_answering_1/bert/pooler/dense/bias:0'] when minimizing the loss.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_question_answering_1/bert/pooler/dense/kernel:0', 'tf_bert_for_question_answering_1/bert/pooler/dense/bias:0'] when minimizing the loss.\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_question_answering_1/bert/pooler/dense/kernel:0', 'tf_bert_for_question_answering_1/bert/pooler/dense/bias:0'] when minimizing the loss.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_question_answering_1/bert/pooler/dense/kernel:0', 'tf_bert_for_question_answering_1/bert/pooler/dense/bias:0'] when minimizing the loss.\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_question_answering_1/bert/pooler/dense/kernel:0', 'tf_bert_for_question_answering_1/bert/pooler/dense/bias:0'] when minimizing the loss.\n" ], "name": "stderr" }, { "output_type": "stream", "text": [ "WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_question_answering_1/bert/pooler/dense/kernel:0', 'tf_bert_for_question_answering_1/bert/pooler/dense/bias:0'] when minimizing the loss.\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_question_answering_1/bert/pooler/dense/kernel:0', 'tf_bert_for_question_answering_1/bert/pooler/dense/bias:0'] when minimizing the loss.\n" ], "name": 
"stderr" }, { "output_type": "stream", "text": [ "3/3 [==============================] - 97s 32s/step - loss: 12.2385 - output_1_loss: 6.0742 - output_2_loss: 6.1642 - output_1_accuracy: 0.0417 - output_2_accuracy: 0.0000e+00\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 17 } ] }, { "cell_type": "markdown", "metadata": { "id": "eREDXWP6vSV8", "colab_type": "text" }, "source": [ "# Metrics API\n", "\n", "`nlp` also provides easy access and sharing of metrics.\n", "\n", "This aspect of the library is still experimental and the API may still evolve more than the datasets API.\n", "\n", "Like datasets, metrics are added as small scripts wrapping common metrics in a common API.\n", "\n", "There are several reasons you may want to use metrics with `nlp` and in particular:\n", "\n", "- metrics for specific datasets like GLUE or SQuAD are provided out-of-the-box in a simple, convenient and consistent way integrated with the dataset,\n", "- metrics in `nlp` leverage the powerful backend to provide smart features out-of-the-box like support for distributed evaluation in PyTorch" ] }, { "cell_type": "markdown", "metadata": { "id": "uUoGMMVKvSV8", "colab_type": "text" }, "source": [ "## Using metrics\n", "\n", "Using metrics is pretty simple: they have two main methods: `.compute(predictions, references)` to directly compute the metric and `.add(prediction, reference)` or `.add_batch(predictions, references)` to only store some results if you want to do the evaluation in one go at the end.\n", "\n", "Here is a quick gist of a standard use of metrics (the simplest usage):\n", "```python\n", "import nlp\n", "bleu_metric = nlp.load_metric('bleu')\n", "\n", "# If you only have a single iteration, you can easily compute the score like this\n", "predictions = model(inputs)\n", "score = bleu_metric.compute(predictions, references)\n", "\n", "# If you have a loop, you can \"add\" your 
predictions and references at each iteration instead of having to save them yourself (the metric object stores them efficiently for you)\n", "for batch in dataloader:\n", "    model_inputs, targets = batch\n", "    predictions = model(model_inputs)\n", "    bleu_metric.add_batch(predictions, targets)\n", "score = bleu_metric.compute()  # Compute the score from all the stored predictions/references\n", "```\n", "\n", "Here is a quick gist of a use in a distributed torch setup (should work for any python multi-process setup actually). It's pretty much identical to the second example above:\n", "```python\n", "import nlp\n", "# You need to give the total number of parallel python processes (num_process) and the id of each process (process_id)\n", "bleu_metric = nlp.load_metric('bleu', process_id=torch.distributed.get_rank(), num_process=torch.distributed.get_world_size())\n", "\n", "for batch in dataloader:\n", "    model_inputs, targets = batch\n", "    predictions = model(model_inputs)\n", "    bleu_metric.add_batch(predictions, targets)\n", "score = bleu_metric.compute()  # Compute the score on the first node by default (can be set to compute on each node as well)\n", "```" ] }, { "cell_type": "markdown", "metadata": { "id": "ySL-vDadvSV8", "colab_type": "text" }, "source": [ "Example with a NER metric: `seqeval`" ] }, { "cell_type": "code", "metadata": { "id": "f4uZym7MvSV9", "colab_type": "code", "colab": {} }, "source": [ "ner_metric = nlp.load_metric('seqeval')\n", "references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]\n", "predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]\n", "ner_metric.compute(predictions, references)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ctY6AIAilLdH", "colab_type": "text" }, "source": [ "# Adding a new dataset or a new metric\n", "\n", "There are two ways to add new datasets and metrics in `nlp`:\n", "\n", "- datasets can be 
added with a Pull-Request adding a script in the `datasets` folder of the [`nlp` repository](https://github.com/huggingface/nlp)\n", "\n", "=> once the PR is merged, the dataset can be instantiated by its folder name, e.g. `nlp.load_dataset('squad')`. If you want HuggingFace to host the data as well you will need to ask the HuggingFace team to upload the data.\n", "\n", "- datasets can also be added with a direct upload using the `nlp` CLI as a user or organization (like for models in `transformers`). In this case the dataset will be accessible under the given user/organization name, e.g. `nlp.load_dataset('thomwolf/squad')`. In this case you can upload the data yourself at the same time and in the same folder.\n", "\n", "We will add a full tutorial on how to add and upload datasets soon." ] }, { "cell_type": "code", "metadata": { "id": "ypLjbtGrljk8", "colab_type": "code", "colab": {} }, "source": [ "" ], "execution_count": 0, "outputs": [] } ] }