{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "c4_200m dataprep.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"4f7c7d14ab5742f6bce3d9168a9c23ba": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_5b3c28a12f8e45d9acdc76ef5e821d6d",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_d1f3e44a980f4b379c86b4d6a27ccde2",
"IPY_MODEL_6a25bb6000b54a1ba3fdd2652a58b264",
"IPY_MODEL_987ce78dbb7e4feea4faa42a0dcb7ad3"
]
}
},
"5b3c28a12f8e45d9acdc76ef5e821d6d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"d1f3e44a980f4b379c86b4d6a27ccde2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_0cbae967f211481c841e10e7a18a4055",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b16e804c85c04bc4b75eba6c991dcaf9"
}
},
"6a25bb6000b54a1ba3fdd2652a58b264": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_045e8e52bcfa42b8a37b3b22e648e159",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1199,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1199,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_9b47ca47664f473681022ffbe5dd9743"
}
},
"987ce78dbb7e4feea4faa42a0dcb7ad3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_ac7a9b82a98740bcad813dc6109d6a99",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1.17k/1.17k [00:00<00:00, 32.3kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d02275c3578a43e68d61da1f72f7f8f1"
}
},
"0cbae967f211481c841e10e7a18a4055": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b16e804c85c04bc4b75eba6c991dcaf9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"045e8e52bcfa42b8a37b3b22e648e159": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"9b47ca47664f473681022ffbe5dd9743": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"ac7a9b82a98740bcad813dc6109d6a99": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"d02275c3578a43e68d61da1f72f7f8f1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"aa6d4261ab10456f944e629b470e6d3d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_346dbb05c00d4e338c02774833590dc6",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_420b0096cfec44db92bbec18abde5bce",
"IPY_MODEL_a1b32715632b4675be53bc28973fd95c",
"IPY_MODEL_465070a195dd4aaa8bf8d7b2f7ac4919"
]
}
},
"346dbb05c00d4e338c02774833590dc6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"420b0096cfec44db92bbec18abde5bce": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_8348bdbfec4e4e798fb7515dfca6c340",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_e14db1423b7641b391f3ee8e8fb2d359"
}
},
"a1b32715632b4675be53bc28973fd95c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_4590a7b2a6104e2d84e9c232af20785d",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 791656,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 791656,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_5c7bb048f4d84783811f0b6662b10596"
}
},
"465070a195dd4aaa8bf8d7b2f7ac4919": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_154a769c7cfe4a97837095f844580d0b",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 773k/773k [00:00<00:00, 1.78MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b8a29074bc75465586850338763830f4"
}
},
"8348bdbfec4e4e798fb7515dfca6c340": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"e14db1423b7641b391f3ee8e8fb2d359": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"4590a7b2a6104e2d84e9c232af20785d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"5c7bb048f4d84783811f0b6662b10596": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"154a769c7cfe4a97837095f844580d0b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b8a29074bc75465586850338763830f4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5ad82ac10a20429bbbf727347ac8df4d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_e7f714e48ef547089e0aa9d913883d76",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_7306b158bac44716b0ba5fb633c2ab5e",
"IPY_MODEL_da2e1584fc084f3d9cb88d5d36e4c464",
"IPY_MODEL_0f159d2bd88044f1ac625d7f5aac843f"
]
}
},
"e7f714e48ef547089e0aa9d913883d76": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7306b158bac44716b0ba5fb633c2ab5e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_aa64dff4968840038670fd6e536132ac",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_f0a827e1f0884e5383bc7a9767acb18c"
}
},
"da2e1584fc084f3d9cb88d5d36e4c464": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_49348f0ae017403c87b3079765786f87",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1389353,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1389353,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_f54ccb6645844050a983b5bfc835fcb5"
}
},
"0f159d2bd88044f1ac625d7f5aac843f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_5a9402cce2854c668ec976b15619c514",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1.32M/1.32M [00:00<00:00, 3.90MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_9b3b28d984b7402e823bc52bb54c073b"
}
},
"aa64dff4968840038670fd6e536132ac": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"f0a827e1f0884e5383bc7a9767acb18c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"49348f0ae017403c87b3079765786f87": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"f54ccb6645844050a983b5bfc835fcb5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5a9402cce2854c668ec976b15619c514": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"9b3b28d984b7402e823bc52bb54c073b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MO2O2Y3qycm9",
"outputId": "8067972c-c60a-49ca-a9f0-48aa723dc7b4"
},
"source": [
"!pip install -q apache_beam\n",
"!pip install -q tensorflow-datasets tfds-nightly"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[K |████████████████████████████████| 9.8 MB 5.1 MB/s \n",
"\u001b[K |████████████████████████████████| 247 kB 71.2 MB/s \n",
"\u001b[K |████████████████████████████████| 151 kB 74.3 MB/s \n",
"\u001b[K |████████████████████████████████| 829 kB 31.7 MB/s \n",
"\u001b[K |████████████████████████████████| 62 kB 958 kB/s \n",
"\u001b[K |████████████████████████████████| 2.3 MB 65.8 MB/s \n",
"\u001b[?25h Building wheel for avro-python3 (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for dill (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for future (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"multiprocess 0.70.12.2 requires dill>=0.3.4, but you have dill 0.3.1.1 which is incompatible.\n",
"google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.26.0 which is incompatible.\n",
"datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.\u001b[0m\n",
"\u001b[K |████████████████████████████████| 4.1 MB 5.1 MB/s \n",
"\u001b[?25h"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "_x9-rjF9ygED"
},
"source": [
"import tensorflow_datasets as tfds"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "kGBrOAl1pTbv"
},
"source": [
"!mkdir ~/.kaggle\n",
"!pip install -q kaggle"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "6WcnyCcQpWsY"
},
"source": [
"kaggle_username =''\n",
"kaggle_key = ''\n",
"!echo '{\"username\":kaggle_username,\"key\":kaggle_key}' > ~/.kaggle/kaggle.json"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WYB8mOUopgOw",
"outputId": "f34867a2-0634-41ab-9f2c-60a843e39c2a"
},
"source": [
"!kaggle datasets download -d a0155991rliwei/c4-200m -p /content"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /root/.kaggle/kaggle.json'\n",
"Downloading c4-200m.zip to /content\n",
"100% 19.3G/19.3G [03:35<00:00, 79.4MB/s]\n",
"100% 19.3G/19.3G [03:35<00:00, 96.1MB/s]\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8ujxx6RGqRtO"
},
"source": [
"### Load the dataset and shard it"
]
},
{
"cell_type": "code",
"source": [
"!pip install sentencepiece"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i9FuI12vrOi2",
"outputId": "f1a51a6e-dafe-4ae5-f154-bf4b74496596"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting sentencepiece\n",
" Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
"\u001b[K |████████████████████████████████| 1.2 MB 4.9 MB/s \n",
"\u001b[?25hInstalling collected packages: sentencepiece\n",
"Successfully installed sentencepiece-0.1.96\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install -q transformers"
],
"metadata": {
"id": "q0N3KiNprJsj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from transformers import (\n",
" AutoConfig,\n",
" AutoTokenizer,\n",
" AutoModelForSequenceClassification,\n",
")\n",
"import pandas as pd"
],
"metadata": {
"id": "ocx2nCbvqPcZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model_name = 't5-base'\n",
"# Initialise tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 113,
"referenced_widgets": [
"4f7c7d14ab5742f6bce3d9168a9c23ba",
"5b3c28a12f8e45d9acdc76ef5e821d6d",
"d1f3e44a980f4b379c86b4d6a27ccde2",
"6a25bb6000b54a1ba3fdd2652a58b264",
"987ce78dbb7e4feea4faa42a0dcb7ad3",
"0cbae967f211481c841e10e7a18a4055",
"b16e804c85c04bc4b75eba6c991dcaf9",
"045e8e52bcfa42b8a37b3b22e648e159",
"9b47ca47664f473681022ffbe5dd9743",
"ac7a9b82a98740bcad813dc6109d6a99",
"d02275c3578a43e68d61da1f72f7f8f1",
"aa6d4261ab10456f944e629b470e6d3d",
"346dbb05c00d4e338c02774833590dc6",
"420b0096cfec44db92bbec18abde5bce",
"a1b32715632b4675be53bc28973fd95c",
"465070a195dd4aaa8bf8d7b2f7ac4919",
"8348bdbfec4e4e798fb7515dfca6c340",
"e14db1423b7641b391f3ee8e8fb2d359",
"4590a7b2a6104e2d84e9c232af20785d",
"5c7bb048f4d84783811f0b6662b10596",
"154a769c7cfe4a97837095f844580d0b",
"b8a29074bc75465586850338763830f4",
"5ad82ac10a20429bbbf727347ac8df4d",
"e7f714e48ef547089e0aa9d913883d76",
"7306b158bac44716b0ba5fb633c2ab5e",
"da2e1584fc084f3d9cb88d5d36e4c464",
"0f159d2bd88044f1ac625d7f5aac843f",
"aa64dff4968840038670fd6e536132ac",
"f0a827e1f0884e5383bc7a9767acb18c",
"49348f0ae017403c87b3079765786f87",
"f54ccb6645844050a983b5bfc835fcb5",
"5a9402cce2854c668ec976b15619c514",
"9b3b28d984b7402e823bc52bb54c073b"
]
},
"id": "reLEvdwBqeM1",
"outputId": "a2704880-a779-41f3-aacb-68d70d91d4db"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4f7c7d14ab5742f6bce3d9168a9c23ba",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/1.17k [00:00, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "aa6d4261ab10456f944e629b470e6d3d",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/773k [00:00, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ad82ac10a20429bbbf727347ac8df4d",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/1.32M [00:00, ?B/s]"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "KoXHd-yf1pPn"
},
"source": [
"!unzip -q /content/c4-200m.zip"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "yK9Vz-JKsIlz",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "256d1432-c16b-49d2-c8d4-9c91db4cf902"
},
"source": [
"c4_builder = tfds.core.builder_from_directory('/content/c4200m/1.0.0')\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"INFO:absl:Load dataset info from /content/c4200m/1.0.0\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Fbs-cabXvaqe",
"outputId": "c4b5cf72-283c-4c1f-967b-923340e0e558"
},
"source": [
"# Metadata are available as usual\n",
"num_train_examples = c4_builder.info.splits['train'].num_examples\n",
"print(num_train_examples)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"183894319\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"c4_builder.download_and_prepare()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qYiuD7MbuB4W",
"outputId": "e581020b-491e-4602-e02d-1420f99e3cf7"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"INFO:absl:Reusing dataset c4200m (/content/c4200m/1.0.0)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"train_data = c4_builder.as_dataset(split='train', shuffle_files=True)\n",
"train_df = tfds.as_dataframe(train_data.take(550000))\n",
"train_df.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wW73sHNvuvH6",
"outputId": "12466e84-8604-4893-97c9-8fe77c2aec76"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(550000, 2)"
]
},
"metadata": {},
"execution_count": 30
}
]
},
{
"cell_type": "code",
"source": [
"train_df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "l0ioHof4vsny",
"outputId": "10223ec1-13f0-476c-d53b-62f34f5e74ef"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" input | \n",
" output | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" b\"Can be empenty'' for more damage, but not ne... | \n",
" b'Can be empathy for more damage, but not need... | \n",
"
\n",
" \n",
" 1 | \n",
" b'Miguelx completed Pollster Badge.' | \n",
" b'Miguelx completed the Pollster Badge.' | \n",
"
\n",
" \n",
" 2 | \n",
" b'This classic three day itinerary is take you... | \n",
" b'This classic three-day itinerary takes you t... | \n",
"
\n",
" \n",
" 3 | \n",
" b'Kimbrelle shares an inspirational story wher... | \n",
" b'Kimbrelle shares an inspirational story wher... | \n",
"
\n",
" \n",
" 4 | \n",
" b'Variation: The utility players get a guideli... | \n",
" b'Variation: The utility players get a guideli... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" input output\n",
"0 b\"Can be empenty'' for more damage, but not ne... b'Can be empathy for more damage, but not need...\n",
"1 b'Miguelx completed Pollster Badge.' b'Miguelx completed the Pollster Badge.'\n",
"2 b'This classic three day itinerary is take you... b'This classic three-day itinerary takes you t...\n",
"3 b'Kimbrelle shares an inspirational story wher... b'Kimbrelle shares an inspirational story wher...\n",
"4 b'Variation: The utility players get a guideli... b'Variation: The utility players get a guideli..."
]
},
"metadata": {},
"execution_count": 29
}
]
},
{
"cell_type": "code",
"source": [
"train_df['input'] = train_df['input'].str.decode(encoding = 'UTF-8')\n",
"train_df['output'] = train_df['output'].str.decode(encoding = 'UTF-8')\n",
"train_df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "JX0blLmbzg-G",
"outputId": "5bca1ad7-709d-461a-d463-80a0f9320bf3"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" input | \n",
" output | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" The steps below describe how to remove data fo... | \n",
" The steps below describe how to remove data fo... | \n",
"
\n",
" \n",
" 1 | \n",
" When I wake up it\\'s usually comes out dreamsI... | \n",
" When I wake up it\\'s usually dreams I\\'m think... | \n",
"
\n",
" \n",
" 2 | \n",
" One of the cardinal factors to be considered t... | \n",
" One of the cardinal factors to consider when t... | \n",
"
\n",
" \n",
" 3 | \n",
" Answers » Regions » Is in Nagorno-Karabakt reg... | \n",
" Answers » Regions » Is Nagorno-Karabakh region... | \n",
"
\n",
" \n",
" 4 | \n",
" Flaneuring in fun at maple creek SK! | \n",
" Flaneuring Fun in Maple Creek SK! | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" input output\n",
"0 The steps below describe how to remove data fo... The steps below describe how to remove data fo...\n",
"1 When I wake up it\\'s usually comes out dreamsI... When I wake up it\\'s usually dreams I\\'m think...\n",
"2 One of the cardinal factors to be considered t... One of the cardinal factors to consider when t...\n",
"3 Answers » Regions » Is in Nagorno-Karabakt reg... Answers » Regions » Is Nagorno-Karabakh region...\n",
"4 Flaneuring in fun at maple creek SK! Flaneuring Fun in Maple Creek SK!"
]
},
"metadata": {},
"execution_count": 32
}
]
},
{
"cell_type": "code",
"source": [
"train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_550k.csv', index=False)"
],
"metadata": {
"id": "ZL3vXy6HyhzC"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Also Create a 1MM dataset"
],
"metadata": {
"id": "O0RDvT9yz8G1"
}
},
{
"cell_type": "code",
"metadata": {
"id": "vGIOWlz7u355",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "8630dbd4-5d11-4574-b722-eb73cef1dd91"
},
"source": [
"# Build the 1M-example sample for the larger CSV export.\n",
"# The take() size must be 1,000,000 here: the previous value of 550000 was a\n",
"# copy-paste leftover from the 550k cell above and contradicted both the\n",
"# recorded shape (1000000, 2) and the c4_200m_1M.csv file name used below.\n",
"# NOTE(review): shuffle_files=True is used without an explicit seed, so the\n",
"# sampled rows presumably differ between runs - confirm if that is intended.\n",
"train_data = c4_builder.as_dataset(split='train', shuffle_files=True)\n",
"train_df = tfds.as_dataframe(train_data.take(1_000_000))\n",
"train_df.shape"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(1000000, 2)"
]
},
"metadata": {},
"execution_count": 34
}
]
},
{
"cell_type": "code",
"source": [
"train_df['input'] = train_df['input'].str.decode(encoding = 'UTF-8')\n",
"train_df['output'] = train_df['output'].str.decode(encoding = 'UTF-8')\n",
"train_df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "DvEAQOz60DBz",
"outputId": "6ed9c218-1175-4b62-9f6e-0195d11aafc1"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" input | \n",
" output | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Medell he, Ohio W. Shannon Kansas,R. C. | \n",
" Medell, Ohio; W. Shannon. Kansas; R. C. | \n",
"
\n",
" \n",
" 1 | \n",
" quarter of 1999 $ 25 million was repaid under ... | \n",
" quarter of 1999, $25 million was repaid under ... | \n",
"
\n",
" \n",
" 2 | \n",
" It used as service center by the Block office ... | \n",
" It can be used as a service center by the Bloc... | \n",
"
\n",
" \n",
" 3 | \n",
" Tom offered two this time, one of old restaura... | \n",
" Tom offered two this time, one of old restaura... | \n",
"
\n",
" \n",
" 4 | \n",
" You can see 'Spring beauties' at The Botanical... | \n",
" You can see Spring Beauties at The Botanical G... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" input output\n",
"0 Medell he, Ohio W. Shannon Kansas,R. C. Medell, Ohio; W. Shannon. Kansas; R. C.\n",
"1 quarter of 1999 $ 25 million was repaid under ... quarter of 1999, $25 million was repaid under ...\n",
"2 It used as service center by the Block office ... It can be used as a service center by the Bloc...\n",
"3 Tom offered two this time, one of old restaura... Tom offered two this time, one of old restaura...\n",
"4 You can see 'Spring beauties' at The Botanical... You can see Spring Beauties at The Botanical G..."
]
},
"metadata": {},
"execution_count": 35
}
]
},
{
"cell_type": "code",
"source": [
"train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_1M.csv', index=False)"
],
"metadata": {
"id": "4IVLTBFI0GoH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
""
],
"metadata": {
"id": "5cglDzbf0JV3"
},
"execution_count": null,
"outputs": []
}
]
}