{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "c4_200m dataprep.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "4f7c7d14ab5742f6bce3d9168a9c23ba": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_5b3c28a12f8e45d9acdc76ef5e821d6d", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_d1f3e44a980f4b379c86b4d6a27ccde2", "IPY_MODEL_6a25bb6000b54a1ba3fdd2652a58b264", "IPY_MODEL_987ce78dbb7e4feea4faa42a0dcb7ad3" ] } }, "5b3c28a12f8e45d9acdc76ef5e821d6d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, 
"object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "d1f3e44a980f4b379c86b4d6a27ccde2": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_0cbae967f211481c841e10e7a18a4055", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "Downloading: 100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_b16e804c85c04bc4b75eba6c991dcaf9" } }, "6a25bb6000b54a1ba3fdd2652a58b264": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_045e8e52bcfa42b8a37b3b22e648e159", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 1199, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 1199, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_9b47ca47664f473681022ffbe5dd9743" } }, "987ce78dbb7e4feea4faa42a0dcb7ad3": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_ac7a9b82a98740bcad813dc6109d6a99", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 1.17k/1.17k [00:00<00:00, 32.3kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": 
"@jupyter-widgets/controls", "layout": "IPY_MODEL_d02275c3578a43e68d61da1f72f7f8f1" } }, "0cbae967f211481c841e10e7a18a4055": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "b16e804c85c04bc4b75eba6c991dcaf9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "045e8e52bcfa42b8a37b3b22e648e159": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", 
"_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "9b47ca47664f473681022ffbe5dd9743": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "ac7a9b82a98740bcad813dc6109d6a99": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "d02275c3578a43e68d61da1f72f7f8f1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", 
"overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "aa6d4261ab10456f944e629b470e6d3d": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_346dbb05c00d4e338c02774833590dc6", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_420b0096cfec44db92bbec18abde5bce", "IPY_MODEL_a1b32715632b4675be53bc28973fd95c", "IPY_MODEL_465070a195dd4aaa8bf8d7b2f7ac4919" ] } }, "346dbb05c00d4e338c02774833590dc6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", 
"top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "420b0096cfec44db92bbec18abde5bce": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_8348bdbfec4e4e798fb7515dfca6c340", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "Downloading: 100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_e14db1423b7641b391f3ee8e8fb2d359" } }, "a1b32715632b4675be53bc28973fd95c": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_4590a7b2a6104e2d84e9c232af20785d", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 791656, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 791656, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_5c7bb048f4d84783811f0b6662b10596" } }, "465070a195dd4aaa8bf8d7b2f7ac4919": { "model_module": 
"@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_154a769c7cfe4a97837095f844580d0b", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 773k/773k [00:00<00:00, 1.78MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_b8a29074bc75465586850338763830f4" } }, "8348bdbfec4e4e798fb7515dfca6c340": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "e14db1423b7641b391f3ee8e8fb2d359": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": 
"1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "4590a7b2a6104e2d84e9c232af20785d": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "5c7bb048f4d84783811f0b6662b10596": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "154a769c7cfe4a97837095f844580d0b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", 
"description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "b8a29074bc75465586850338763830f4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "5ad82ac10a20429bbbf727347ac8df4d": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_e7f714e48ef547089e0aa9d913883d76", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_7306b158bac44716b0ba5fb633c2ab5e", "IPY_MODEL_da2e1584fc084f3d9cb88d5d36e4c464", "IPY_MODEL_0f159d2bd88044f1ac625d7f5aac843f" ] } }, 
"e7f714e48ef547089e0aa9d913883d76": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "7306b158bac44716b0ba5fb633c2ab5e": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_aa64dff4968840038670fd6e536132ac", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "Downloading: 100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_f0a827e1f0884e5383bc7a9767acb18c" } }, "da2e1584fc084f3d9cb88d5d36e4c464": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": 
"IPY_MODEL_49348f0ae017403c87b3079765786f87", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 1389353, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 1389353, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_f54ccb6645844050a983b5bfc835fcb5" } }, "0f159d2bd88044f1ac625d7f5aac843f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_5a9402cce2854c668ec976b15619c514", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 1.32M/1.32M [00:00<00:00, 3.90MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_9b3b28d984b7402e823bc52bb54c073b" } }, "aa64dff4968840038670fd6e536132ac": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "f0a827e1f0884e5383bc7a9767acb18c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, 
"align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "49348f0ae017403c87b3079765786f87": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "f54ccb6645844050a983b5bfc835fcb5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": 
null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "5a9402cce2854c668ec976b15619c514": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "9b3b28d984b7402e823bc52bb54c073b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": 
"MO2O2Y3qycm9", "outputId": "8067972c-c60a-49ca-a9f0-48aa723dc7b4" }, "source": [ "!pip install -q apache_beam\n", "!pip install -q tensorflow-datasets tfds-nightly" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[K |████████████████████████████████| 9.8 MB 5.1 MB/s \n", "\u001b[K |████████████████████████████████| 247 kB 71.2 MB/s \n", "\u001b[K |████████████████████████████████| 151 kB 74.3 MB/s \n", "\u001b[K |████████████████████████████████| 829 kB 31.7 MB/s \n", "\u001b[K |████████████████████████████████| 62 kB 958 kB/s \n", "\u001b[K |████████████████████████████████| 2.3 MB 65.8 MB/s \n", "\u001b[?25h Building wheel for avro-python3 (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for dill (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for future (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", "multiprocess 0.70.12.2 requires dill>=0.3.4, but you have dill 0.3.1.1 which is incompatible.\n", "google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.26.0 which is incompatible.\n", "datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\u001b[0m\n", "\u001b[K |████████████████████████████████| 4.1 MB 5.1 MB/s \n", "\u001b[?25h" ] } ] }, { "cell_type": "code", "metadata": { "id": "_x9-rjF9ygED" }, "source": [ "import tensorflow_datasets as tfds" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "kGBrOAl1pTbv" }, "source": [ "!mkdir -p ~/.kaggle\n", "!pip install -q kaggle" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "6WcnyCcQpWsY" }, "source": [ "import json\n", "import os\n", "\n", "# Fill in your Kaggle API credentials before running; never commit real values.\n", "kaggle_username = ''\n", "kaggle_key = ''\n", "\n", "# Serialise with json.dump so kaggle.json is valid JSON holding the variable\n", "# values (a shell echo of the dict literal does not substitute them).\n", "kaggle_config_path = os.path.expanduser('~/.kaggle/kaggle.json')\n", "os.makedirs(os.path.dirname(kaggle_config_path), exist_ok=True)\n", "with open(kaggle_config_path, 'w') as config_file:\n", "    json.dump({'username': kaggle_username, 'key': kaggle_key}, config_file)\n", "\n", "# Owner-only permissions: the kaggle CLI warns when the key file is readable by other users.\n", "os.chmod(kaggle_config_path, 0o600)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WYB8mOUopgOw", "outputId": "f34867a2-0634-41ab-9f2c-60a843e39c2a" }, "source": [ "!kaggle datasets download -d a0155991rliwei/c4-200m -p /content" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Warning: Your Kaggle API key is readable by other users on this system! 
To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'\n", "Downloading c4-200m.zip to /content\n", "100% 19.3G/19.3G [03:35<00:00, 79.4MB/s]\n", "100% 19.3G/19.3G [03:35<00:00, 96.1MB/s]\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "8ujxx6RGqRtO" }, "source": [ "### Load the dataset and shard it" ] }, { "cell_type": "code", "source": [ "!pip install sentencepiece" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "i9FuI12vrOi2", "outputId": "f1a51a6e-dafe-4ae5-f154-bf4b74496596" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting sentencepiece\n", " Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", "\u001b[K |████████████████████████████████| 1.2 MB 4.9 MB/s \n", "\u001b[?25hInstalling collected packages: sentencepiece\n", "Successfully installed sentencepiece-0.1.96\n" ] } ] }, { "cell_type": "code", "source": [ "!pip install -q transformers" ], "metadata": { "id": "q0N3KiNprJsj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from transformers import (\n", " AutoConfig,\n", " AutoTokenizer,\n", " AutoModelForSequenceClassification,\n", ")\n", "import pandas as pd" ], "metadata": { "id": "ocx2nCbvqPcZ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model_name = 't5-base'\n", "# Initialise tokenizer\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 113, "referenced_widgets": [ "4f7c7d14ab5742f6bce3d9168a9c23ba", "5b3c28a12f8e45d9acdc76ef5e821d6d", "d1f3e44a980f4b379c86b4d6a27ccde2", "6a25bb6000b54a1ba3fdd2652a58b264", "987ce78dbb7e4feea4faa42a0dcb7ad3", "0cbae967f211481c841e10e7a18a4055", "b16e804c85c04bc4b75eba6c991dcaf9", "045e8e52bcfa42b8a37b3b22e648e159", "9b47ca47664f473681022ffbe5dd9743", "ac7a9b82a98740bcad813dc6109d6a99", 
"d02275c3578a43e68d61da1f72f7f8f1", "aa6d4261ab10456f944e629b470e6d3d", "346dbb05c00d4e338c02774833590dc6", "420b0096cfec44db92bbec18abde5bce", "a1b32715632b4675be53bc28973fd95c", "465070a195dd4aaa8bf8d7b2f7ac4919", "8348bdbfec4e4e798fb7515dfca6c340", "e14db1423b7641b391f3ee8e8fb2d359", "4590a7b2a6104e2d84e9c232af20785d", "5c7bb048f4d84783811f0b6662b10596", "154a769c7cfe4a97837095f844580d0b", "b8a29074bc75465586850338763830f4", "5ad82ac10a20429bbbf727347ac8df4d", "e7f714e48ef547089e0aa9d913883d76", "7306b158bac44716b0ba5fb633c2ab5e", "da2e1584fc084f3d9cb88d5d36e4c464", "0f159d2bd88044f1ac625d7f5aac843f", "aa64dff4968840038670fd6e536132ac", "f0a827e1f0884e5383bc7a9767acb18c", "49348f0ae017403c87b3079765786f87", "f54ccb6645844050a983b5bfc835fcb5", "5a9402cce2854c668ec976b15619c514", "9b3b28d984b7402e823bc52bb54c073b" ] }, "id": "reLEvdwBqeM1", "outputId": "a2704880-a779-41f3-aacb-68d70d91d4db" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4f7c7d14ab5742f6bce3d9168a9c23ba", "version_minor": 0, "version_major": 2 }, "text/plain": [ "Downloading: 0%| | 0.00/1.17k [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
inputoutput
0b\"Can be empenty'' for more damage, but not ne...b'Can be empathy for more damage, but not need...
1b'Miguelx completed Pollster Badge.'b'Miguelx completed the Pollster Badge.'
2b'This classic three day itinerary is take you...b'This classic three-day itinerary takes you t...
3b'Kimbrelle shares an inspirational story wher...b'Kimbrelle shares an inspirational story wher...
4b'Variation: The utility players get a guideli...b'Variation: The utility players get a guideli...
\n", "" ], "text/plain": [ " input output\n", "0 b\"Can be empenty'' for more damage, but not ne... b'Can be empathy for more damage, but not need...\n", "1 b'Miguelx completed Pollster Badge.' b'Miguelx completed the Pollster Badge.'\n", "2 b'This classic three day itinerary is take you... b'This classic three-day itinerary takes you t...\n", "3 b'Kimbrelle shares an inspirational story wher... b'Kimbrelle shares an inspirational story wher...\n", "4 b'Variation: The utility players get a guideli... b'Variation: The utility players get a guideli..." ] }, "metadata": {}, "execution_count": 29 } ] }, { "cell_type": "code", "source": [ "train_df['input'] = train_df['input'].str.decode(encoding = 'UTF-8')\n", "train_df['output'] = train_df['output'].str.decode(encoding = 'UTF-8')\n", "train_df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "JX0blLmbzg-G", "outputId": "5bca1ad7-709d-461a-d463-80a0f9320bf3" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
inputoutput
0The steps below describe how to remove data fo...The steps below describe how to remove data fo...
1When I wake up it\\'s usually comes out dreamsI...When I wake up it\\'s usually dreams I\\'m think...
2One of the cardinal factors to be considered t...One of the cardinal factors to consider when t...
3Answers » Regions » Is in Nagorno-Karabakt reg...Answers » Regions » Is Nagorno-Karabakh region...
4Flaneuring in fun at maple creek SK!Flaneuring Fun in Maple Creek SK!
\n", "
" ], "text/plain": [ " input output\n", "0 The steps below describe how to remove data fo... The steps below describe how to remove data fo...\n", "1 When I wake up it\\'s usually comes out dreamsI... When I wake up it\\'s usually dreams I\\'m think...\n", "2 One of the cardinal factors to be considered t... One of the cardinal factors to consider when t...\n", "3 Answers » Regions » Is in Nagorno-Karabakt reg... Answers » Regions » Is Nagorno-Karabakh region...\n", "4 Flaneuring in fun at maple creek SK! Flaneuring Fun in Maple Creek SK!" ] }, "metadata": {}, "execution_count": 32 } ] }, { "cell_type": "code", "source": [ "train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_550k.csv', index=False)" ], "metadata": { "id": "ZL3vXy6HyhzC" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Also Create a 1MM dataset" ], "metadata": { "id": "O0RDvT9yz8G1" } }, { "cell_type": "code", "metadata": { "id": "vGIOWlz7u355", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8630dbd4-5d11-4574-b722-eb73cef1dd91" }, "source": [ "train_data = c4_builder.as_dataset(split='train', shuffle_files=True)\n", "train_df = tfds.as_dataframe(train_data.take(550000))\n", "train_df.shape" ], "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "(1000000, 2)" ] }, "metadata": {}, "execution_count": 34 } ] }, { "cell_type": "code", "source": [ "train_df['input'] = train_df['input'].str.decode(encoding = 'UTF-8')\n", "train_df['output'] = train_df['output'].str.decode(encoding = 'UTF-8')\n", "train_df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "id": "DvEAQOz60DBz", "outputId": "6ed9c218-1175-4b62-9f6e-0195d11aafc1" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", 
"data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
inputoutput
0Medell he, Ohio W. Shannon Kansas,R. C.Medell, Ohio; W. Shannon. Kansas; R. C.
1quarter of 1999 $ 25 million was repaid under ...quarter of 1999, $25 million was repaid under ...
2It used as service center by the Block office ...It can be used as a service center by the Bloc...
3Tom offered two this time, one of old restaura...Tom offered two this time, one of old restaura...
4You can see 'Spring beauties' at The Botanical...You can see Spring Beauties at The Botanical G...
\n", "
" ], "text/plain": [ " input output\n", "0 Medell he, Ohio W. Shannon Kansas,R. C. Medell, Ohio; W. Shannon. Kansas; R. C.\n", "1 quarter of 1999 $ 25 million was repaid under ... quarter of 1999, $25 million was repaid under ...\n", "2 It used as service center by the Block office ... It can be used as a service center by the Bloc...\n", "3 Tom offered two this time, one of old restaura... Tom offered two this time, one of old restaura...\n", "4 You can see 'Spring beauties' at The Botanical... You can see Spring Beauties at The Botanical G..." ] }, "metadata": {}, "execution_count": 35 } ] }, { "cell_type": "code", "source": [ "train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_1M.csv', index=False)" ], "metadata": { "id": "4IVLTBFI0GoH" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "" ], "metadata": { "id": "5cglDzbf0JV3" }, "execution_count": null, "outputs": [] } ] }