{"cells":[{"cell_type":"markdown","source":["# Cleaning and preparing an image dataset using fastdup V1.0"],"metadata":{"id":"LCLC7GEmzriP"},"id":"LCLC7GEmzriP"},{"cell_type":"code","execution_count":null,"id":"58a77ca9-6df1-4ac1-b041-fdc85ad59ddb","metadata":{"id":"58a77ca9-6df1-4ac1-b041-fdc85ad59ddb"},"outputs":[],"source":["# download fastdup\n","!pip install pip -U\n","!pip install fastdup\n","!pip install pandas\n","!pip install matplotlib\n","!pip install wurlitzer\n","%load_ext wurlitzer"]},{"cell_type":"code","execution_count":2,"id":"6a49b5eb","metadata":{"id":"6a49b5eb","executionInfo":{"status":"ok","timestamp":1677668109538,"user_tz":-120,"elapsed":2034,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[],"source":["import fastdup\n","import pandas as pd"]},{"cell_type":"markdown","id":"ff4dfa80-d1e4-46d1-ae10-e8715c16bb07","metadata":{"id":"ff4dfa80-d1e4-46d1-ae10-e8715c16bb07"},"source":["# Download food-101 Dataset"]},{"cell_type":"code","execution_count":1,"id":"fddb8af6","metadata":{"id":"fddb8af6","outputId":"562c7636-b0bb-4bdb-c282-b229ffeaaf85","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1677653732682,"user_tz":-120,"elapsed":349477,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["--2023-03-01 06:49:43-- http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz\n","Resolving data.vision.ee.ethz.ch (data.vision.ee.ethz.ch)... 129.132.52.178, 2001:67c:10ec:36c2::178\n","Connecting to data.vision.ee.ethz.ch (data.vision.ee.ethz.ch)|129.132.52.178|:80... connected.\n","HTTP request sent, awaiting response... 302 Found\n","Location: https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz [following]\n","--2023-03-01 06:49:43-- https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz\n","Connecting to data.vision.ee.ethz.ch (data.vision.ee.ethz.ch)|129.132.52.178|:443... 
connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 4996278331 (4.7G) [application/x-gzip]\n","Saving to: ‘food-101.tar.gz’\n","\n","food-101.tar.gz 100%[===================>] 4.65G 18.6MB/s in 4m 22s \n","\n","2023-03-01 06:54:06 (18.2 MB/s) - ‘food-101.tar.gz’ saved [4996278331/4996278331]\n","\n"]}],"source":["!wget http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz\n","!tar -xf food-101.tar.gz"]},{"cell_type":"markdown","id":"7e2e70a3","metadata":{"tags":[],"id":"7e2e70a3"},"source":["# Run fastdup"]},{"cell_type":"code","execution_count":3,"id":"b0108828-f2ee-435a-8ddb-2344ddcfbd4d","metadata":{"id":"b0108828-f2ee-435a-8ddb-2344ddcfbd4d","executionInfo":{"status":"ok","timestamp":1677668113537,"user_tz":-120,"elapsed":443,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[],"source":["images_dir = 'food-101/images/'\n","work_dir = 'fastdup_food101'"]},{"cell_type":"code","execution_count":4,"id":"2f7632e1","metadata":{"scrolled":true,"tags":[],"id":"2f7632e1","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1677670755290,"user_tz":-120,"elapsed":2639768,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}},"outputId":"f614f0d7-1606-41fa-8a8f-9f935dbe8256"},"outputs":[{"output_type":"stream","name":"stdout","text":["FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. 
Danny Bickson.\n","2023-03-01 10:55:15 [INFO] Going to loop over dir food-101/images\n","2023-03-01 10:55:22 [INFO] Found total 101000 images to run on\n","2023-03-01 11:26:06 [INFO] Found total 101000 images to run on\n","2023-03-01 11:36:56 [INFO] 649444) Finished write_index() NN model\n","2023-03-01 11:36:56 [INFO] Stored nn model index file fastdup_food101/nnf.index\n","2023-03-01 11:39:04 [INFO] Total time took 2622423 ms\n","2023-03-01 11:39:04 [INFO] Found a total of 170 fully identical images (d>0.990), which are 0.06 %\n","2023-03-01 11:39:04 [INFO] Found a total of 88 nearly identical images(d>0.980), which are 0.03 %\n","2023-03-01 11:39:04 [INFO] Found a total of 5236 above threshold images (d>0.900), which are 1.73 %\n","2023-03-01 11:39:04 [INFO] Found a total of 10100 outlier images (d<0.050), which are 3.33 %\n","2023-03-01 11:39:04 [INFO] Min distance found 0.379 max distance 1.000\n","2023-03-01 11:39:04 [INFO] Running connected components for ccthreshold 0.960000 \n",".0\n"," ########################################################################################\n","\n","Dataset Analysis Summary: \n","\n"," Dataset contains 101000 images\n"," Valid images are 100.00% (101,000) of the data, invalid are 0.00% (0) of the data\n"," Similarity: 0.23% (228) belong to 3 similarity clusters (components).\n"," 99.77% (100,772) images do not belong to any similarity cluster.\n"," Largest cluster has 6 (0.01%) images.\n"," For a detailed analysis, use `.connected_components()`\n","(similarity threshold used is 0.9, connected component threshold used is 0.96).\n","\n"," Outliers: 5.97% (6,028) of images are possible outliers, and fall in the bottom 5.00% of similarity values.\n"," For a detailed list of outliers, use `.outliers(data=True)`.\n"]}],"source":["fd = fastdup.create(work_dir=work_dir, input_dir=images_dir)\n","fd.run()"]},{"cell_type":"markdown","id":"d4ddd8be","metadata":{"tags":[],"id":"d4ddd8be"},"source":["# Find 
duplicates"]},{"cell_type":"code","execution_count":5,"id":"8aaa2a06","metadata":{"scrolled":true,"id":"8aaa2a06","colab":{"base_uri":"https://localhost:8080/","height":1000,"output_embedded_package_id":"1tgMsme5DD-GtqDFvk-AqD924HaQ_-V9k"},"executionInfo":{"status":"ok","timestamp":1677671278321,"user_tz":-120,"elapsed":16870,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}},"outputId":"ec92d6b4-a937-4736-afa6-f9f5e8cdc423"},"outputs":[{"output_type":"display_data","data":{"text/plain":"Output hidden; open in https://colab.research.google.com to view."},"metadata":{}}],"source":["# visualize clusters of duplicate images\n","fd.vis.component_gallery(max_width=800)"]},{"cell_type":"code","execution_count":6,"id":"e266e5fe","metadata":{"id":"e266e5fe","outputId":"f839fac3-5412-45ba-df5d-abfe027dd842","colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"status":"ok","timestamp":1677671288349,"user_tz":-120,"elapsed":444,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" fastdup_id component_id sum count mean_distance min_distance \\\n","19859 19859 19811 5.8648 6.0 0.9775 0.9721 \n","21874 21874 21803 6.0000 6.0 1.0000 1.0000 \n","21854 21854 21803 6.0000 6.0 1.0000 1.0000 \n","19862 19862 19811 5.8648 6.0 0.9775 0.9721 \n","19861 19861 19811 5.8648 6.0 0.9775 0.9721 \n","\n"," max_distance img_filename error_code is_valid \n","19859 0.9856 chicken_quesadilla/535057.jpg VALID True \n","21874 1.0000 chocolate_cake/55122.jpg VALID True \n","21854 1.0000 chocolate_cake/49494.jpg VALID True \n","19862 0.9856 chicken_quesadilla/535546.jpg VALID True \n","19861 0.9856 chicken_quesadilla/535532.jpg VALID True "],"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n","
\n","
\n","
fastdup_id
\n","
component_id
\n","
sum
\n","
count
\n","
mean_distance
\n","
min_distance
\n","
max_distance
\n","
img_filename
\n","
error_code
\n","
is_valid
\n","
\n"," \n"," \n","
\n","
19859
\n","
19859
\n","
19811
\n","
5.8648
\n","
6.0
\n","
0.9775
\n","
0.9721
\n","
0.9856
\n","
chicken_quesadilla/535057.jpg
\n","
VALID
\n","
True
\n","
\n","
\n","
21874
\n","
21874
\n","
21803
\n","
6.0000
\n","
6.0
\n","
1.0000
\n","
1.0000
\n","
1.0000
\n","
chocolate_cake/55122.jpg
\n","
VALID
\n","
True
\n","
\n","
\n","
21854
\n","
21854
\n","
21803
\n","
6.0000
\n","
6.0
\n","
1.0000
\n","
1.0000
\n","
1.0000
\n","
chocolate_cake/49494.jpg
\n","
VALID
\n","
True
\n","
\n","
\n","
19862
\n","
19862
\n","
19811
\n","
5.8648
\n","
6.0
\n","
0.9775
\n","
0.9721
\n","
0.9856
\n","
chicken_quesadilla/535546.jpg
\n","
VALID
\n","
True
\n","
\n","
\n","
19861
\n","
19861
\n","
19811
\n","
5.8648
\n","
6.0
\n","
0.9775
\n","
0.9721
\n","
0.9856
\n","
chicken_quesadilla/535532.jpg
\n","
VALID
\n","
True
\n","
\n"," \n","
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":6}],"source":["# get a list of clusters with duplicate images using connected components\n","cc_df, _ = fd.connected_components()\n","cc_df[cc_df['count'] > 0.0].sort_values(by=['count'], ascending=False).head()"]},{"cell_type":"markdown","id":"36976bdb-9875-441e-a041-5e1e1304d99b","metadata":{"id":"36976bdb-9875-441e-a041-5e1e1304d99b"},"source":["## Connected Components Grouping"]},{"cell_type":"code","execution_count":7,"id":"168bc163","metadata":{"id":"168bc163","executionInfo":{"status":"ok","timestamp":1677671291446,"user_tz":-120,"elapsed":425,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[],"source":["# a function to group connected components\n","def get_clusters(df, sort_by='count', min_count=2, ascending=False):\n"," # columns to aggregate\n"," agg_dict = {'img_filename': list, 'mean_distance': max, 'count': len}\n","\n"," if 'label' in df.columns:\n"," agg_dict['label'] = list\n"," \n"," # filter by count\n"," df = df[df['count'] >= min_count]\n"," \n"," # group and aggregate columns\n"," grouped_df = df.groupby('component_id').agg(agg_dict)\n"," \n"," # sort\n"," grouped_df = grouped_df.sort_values(by=[sort_by], ascending=ascending)\n"," return grouped_df"]},{"cell_type":"code","execution_count":8,"id":"2d0ebd16","metadata":{"id":"2d0ebd16","outputId":"b3b9f360-6cd8-4bb5-8590-af7c556c88fb","colab":{"base_uri":"https://localhost:8080/","height":175},"executionInfo":{"status":"ok","timestamp":1677671292998,"user_tz":-120,"elapsed":8,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" img_filename \\\n","component_id \n","26465 [crab_cakes/2780617.jpg, crab_cakes/2780621.jpg, crab_cakes/2780623.jpg] \n","21803 [chocolate_cake/49494.jpg, chocolate_cake/51717.jpg, chocolate_cake/55122.jpg] \n","35796 [escargots/637185.jpg, escargots/637187.jpg, escargots/637188.jpg] \n","\n"," mean_distance count 
\n","component_id \n","26465 0.9759 3 \n","21803 1.0000 3 \n","35796 0.9798 3 "],"text/html":["\n","
\n"," "]},"metadata":{},"execution_count":9}],"source":["# get clusters sorted differently\n","get_clusters(cc_df, sort_by='mean_distance').head(3)"]},{"cell_type":"code","execution_count":10,"id":"ed5b9ad3-e428-45f4-a74b-a697e3711a9a","metadata":{"id":"ed5b9ad3-e428-45f4-a74b-a697e3711a9a","executionInfo":{"status":"ok","timestamp":1677671300533,"user_tz":-120,"elapsed":359,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[],"source":["# It's a good start, but as we can see there are not that many duplicates here, and the data may contain more.\n","# Let's lower the threshold a bit and re-evaluate the duplicates case."]},{"cell_type":"markdown","id":"6b9899fc","metadata":{"id":"6b9899fc"},"source":["## Re-run with lower threshold\n","Now we have more clusters containing more of the images, and we are able to remove highly similar images with higher recall."]},{"cell_type":"code","execution_count":11,"id":"f0337c2d","metadata":{"id":"f0337c2d","outputId":"af0e6cc8-4919-4bce-cfc9-da9658c4cf68","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1677673944455,"user_tz":-120,"elapsed":2641287,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. 
Danny Bickson.\n","2023-03-01 11:48:22 [INFO] Going to loop over dir food-101/images\n","2023-03-01 11:48:24 [INFO] Found total 101000 images to run on\n","2023-03-01 12:19:09 [INFO] Found total 101000 images to run on\n","2023-03-01 12:29:58 [INFO] 648922) Finished write_index() NN model\n","2023-03-01 12:29:58 [INFO] Stored nn model index file fastdup_food101/nnf.index\n","2023-03-01 12:32:14 [INFO] Total time took 2630145 ms\n","2023-03-01 12:32:14 [INFO] Found a total of 170 fully identical images (d>0.990), which are 0.06 %\n","2023-03-01 12:32:14 [INFO] Found a total of 88 nearly identical images(d>0.980), which are 0.03 %\n","2023-03-01 12:32:14 [INFO] Found a total of 5236 above threshold images (d>0.900), which are 1.73 %\n","2023-03-01 12:32:14 [INFO] Found a total of 10100 outlier images (d<0.050), which are 3.33 %\n","2023-03-01 12:32:14 [INFO] Min distance found 0.379 max distance 1.000\n","2023-03-01 12:32:14 [INFO] Running connected components for ccthreshold 0.900000 \n",".0\n"," ########################################################################################\n","\n","Dataset Analysis Summary: \n","\n"," Dataset contains 101000 images\n"," Valid images are 100.00% (101,000) of the data, invalid are 0.00% (0) of the data\n"," Similarity: 1.70% (1,718) belong to 30 similarity clusters (components).\n"," 98.30% (99,282) images do not belong to any similarity cluster.\n"," Largest cluster has 79 (0.08%) images.\n"," For a detailed analysis, use `.connected_components()`\n","(similarity threshold used is 0.9, connected component threshold used is 0.9).\n","\n"," Outliers: 5.97% (6,029) of images are possible outliers, and fall in the bottom 5.00% of similarity values.\n"," For a detailed list of outliers, use `.outliers(data=True)`.\n","\n"," ########################################################################################\n","\n","Dataset Analysis Summary: \n","\n"," Dataset contains 101000 images\n"," Valid images are 100.00% (101,000) 
of the data, invalid are 0.00% (0) of the data\n"," Similarity: 1.70% (1,718) belong to 30 similarity clusters (components).\n"," 98.30% (99,282) images do not belong to any similarity cluster.\n"," Largest cluster has 79 (0.08%) images.\n"," For a detailed analysis, use `.connected_components()`\n","(similarity threshold used is 0.9, connected component threshold used is 0.9).\n","\n"," Outliers: 5.97% (6,029) of images are possible outliers, and fall in the bottom 5.00% of similarity values.\n"," For a detailed list of outliers, use `.outliers(data=True)`.\n"]},{"output_type":"execute_result","data":{"text/plain":["['Dataset contains 101000 images',\n"," 'Valid images are 100.00% (101,000) of the data, invalid are 0.00% (0) of the data',\n"," 'Similarity: 1.70% (1,718) belong to 30 similarity clusters (components).',\n"," '98.30% (99,282) images do not belong to any similarity cluster.',\n"," 'Largest cluster has 79 (0.08%) images.',\n"," 'For a detailed analysis, use `.connected_components()`\\n(similarity threshold used is 0.9, connected component threshold used is 0.9).\\n',\n"," 'Outliers: 5.97% (6,029) of images are possible outliers, and fall in the bottom 5.00% of similarity values.',\n"," 'For a detailed list of outliers, use `.outliers(data=True)`.']"]},"metadata":{},"execution_count":11}],"source":["# run with ccthreshold = 0.9 compared to default 0.96\n","fd.run(ccthreshold=0.9, overwrite=True)\n","fd.summary()"]},{"cell_type":"code","execution_count":12,"id":"38e38a88","metadata":{"id":"38e38a88","outputId":"df67f615-6956-4935-99ac-920a6537d906","colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"status":"ok","timestamp":1677675399104,"user_tz":-120,"elapsed":889,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" fastdup_id component_id sum count mean_distance min_distance \\\n","25103 25103 24810 73.0287 79.0 0.9244 0.9004 \n","40109 40109 
24810 73.0287 79.0 0.9244 0.9004 \n","40824 40824 24810 73.0287 79.0 0.9244 0.9004 \n","40807 40807 24810 73.0287 79.0 0.9244 0.9004 \n","40787 40787 24810 73.0287 79.0 0.9244 0.9004 \n","\n"," max_distance img_filename error_code is_valid \n","25103 0.9467 club_sandwich/1318118.jpg VALID True \n","40109 0.9467 french_fries/147628.jpg VALID True \n","40824 0.9467 french_fries/3907871.jpg VALID True \n","40807 0.9467 french_fries/3832957.jpg VALID True \n","40787 0.9467 french_fries/3746805.jpg VALID True "],"text/html":["\n","
\n","
\n","
\n","\n","
\n"," \n","
\n","
\n","
fastdup_id
\n","
component_id
\n","
sum
\n","
count
\n","
mean_distance
\n","
min_distance
\n","
max_distance
\n","
img_filename
\n","
error_code
\n","
is_valid
\n","
\n"," \n"," \n","
\n","
25103
\n","
25103
\n","
24810
\n","
73.0287
\n","
79.0
\n","
0.9244
\n","
0.9004
\n","
0.9467
\n","
club_sandwich/1318118.jpg
\n","
VALID
\n","
True
\n","
\n","
\n","
40109
\n","
40109
\n","
24810
\n","
73.0287
\n","
79.0
\n","
0.9244
\n","
0.9004
\n","
0.9467
\n","
french_fries/147628.jpg
\n","
VALID
\n","
True
\n","
\n","
\n","
40824
\n","
40824
\n","
24810
\n","
73.0287
\n","
79.0
\n","
0.9244
\n","
0.9004
\n","
0.9467
\n","
french_fries/3907871.jpg
\n","
VALID
\n","
True
\n","
\n","
\n","
40807
\n","
40807
\n","
24810
\n","
73.0287
\n","
79.0
\n","
0.9244
\n","
0.9004
\n","
0.9467
\n","
french_fries/3832957.jpg
\n","
VALID
\n","
True
\n","
\n","
\n","
40787
\n","
40787
\n","
24810
\n","
73.0287
\n","
79.0
\n","
0.9244
\n","
0.9004
\n","
0.9467
\n","
french_fries/3746805.jpg
\n","
VALID
\n","
True
\n","
\n"," \n","
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":12}],"source":["# Now we see a much larger amount of images clustered together\n","cc90_df, _ = fd.connected_components()\n","cc90_df[cc90_df['count'] > 0.0].sort_values(by=['count'], ascending=False).head()"]},{"cell_type":"code","execution_count":13,"id":"bc129034","metadata":{"scrolled":true,"id":"bc129034","colab":{"base_uri":"https://localhost:8080/","height":1000,"output_embedded_package_id":"1tLYb9BG9aAoRmPD2Pm99LKnTZWjFU_n4"},"executionInfo":{"status":"ok","timestamp":1677675425276,"user_tz":-120,"elapsed":23407,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}},"outputId":"0c75e1cc-0586-431d-f11a-2f373e1ae922"},"outputs":[{"output_type":"display_data","data":{"text/plain":"Output hidden; open in https://colab.research.google.com to view."},"metadata":{}}],"source":["# let's see the new clusters\n","fd.vis.component_gallery(max_width=800)"]},{"cell_type":"markdown","id":"e06ab2b4","metadata":{"id":"e06ab2b4"},"source":["## Get a list of duplicates to remove"]},{"cell_type":"code","execution_count":14,"id":"717e6151","metadata":{"scrolled":true,"id":"717e6151","outputId":"f350e7e8-d0b8-4f01-8de0-200feef33ce8","colab":{"base_uri":"https://localhost:8080/","height":479},"executionInfo":{"status":"ok","timestamp":1677675458050,"user_tz":-120,"elapsed":384,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" img_filename \\\n","component_id \n","24810 [club_sandwich/1297247.jpg, club_sandwich/1318118.jpg, club_sandwich/1886101.jpg, club_sandwich/2778614.jpg, club_sandwich/3106065.jpg, club_sandwich/588478.jpg, french_fries/1099260.jpg, french_fries/1295274.jpg, french_fries/1361604.jpg, french_fries/1384733.jpg, french_fries/147628.jpg, french_fries/1610240.jpg, french_fries/1692353.jpg, french_fries/1700344.jpg, french_fries/1712331.jpg, french_fries/1740113.jpg, french_fries/1810352.jpg, french_fries/1969264.jpg, 
french_fries/2073415.jpg, french_fries/2246387.jpg, french_fries/2348229.jpg, french_fries/2369999.jpg, french_fries/2700217.jpg, french_fries/2761796.jpg, french_fries/2885926.jpg, french_fries/2936284.jpg, french_fries/3030853.jpg, french_fries/3069835.jpg, french_fries/3359887.jpg, french_fries/3405511.jpg, french_fries/3423618.jpg, french_fries/3499831.jpg, french_fries/3669402.jpg, french_fries/3673168.jpg, french_fries/3697215.jpg, french_fries/3746805.jpg, french_fries/3832957.jpg, french_fries/3907871.jpg, french_fries/467106.jpg, french_fries/889641.jpg] \n","18229 [chicken_curry/2394967.jpg, chicken_curry/2701143.jpg, chicken_curry/882723.jpg, hot_and_sour_soup/1151861.jpg, hot_and_sour_soup/1167380.jpg, hot_and_sour_soup/1400511.jpg, hot_and_sour_soup/1617113.jpg, hot_and_sour_soup/1670529.jpg, hot_and_sour_soup/2041812.jpg, hot_and_sour_soup/2367229.jpg, hot_and_sour_soup/2377494.jpg, hot_and_sour_soup/2520927.jpg, hot_and_sour_soup/3086202.jpg, hot_and_sour_soup/3113531.jpg, hot_and_sour_soup/3286625.jpg, hot_and_sour_soup/3428336.jpg, hot_and_sour_soup/3452669.jpg, hot_and_sour_soup/3552976.jpg, hot_and_sour_soup/3567487.jpg, hot_and_sour_soup/3568665.jpg, hot_and_sour_soup/3601021.jpg, hot_and_sour_soup/3706507.jpg, hot_and_sour_soup/387487.jpg, hot_and_sour_soup/478316.jpg, hot_and_sour_soup/564763.jpg, lobster_bisque/1346617.jpg, lobster_bisque/1826587.jpg, lobster_bisque/2917736.jpg, lobster_bisque/3282626.jpg, lobster_bisque/3319694.jpg, lobster_bisque/3358721.jpg, lobster_bisque/3414592.jpg, lobster_bisque/3466502.jpg] \n","26394 [crab_cakes/3467918.jpg, pad_thai/1709738.jpg, pad_thai/3059603.jpg, spaghetti_bolognese/3565695.jpg, spaghetti_carbonara/1117183.jpg, spaghetti_carbonara/1390373.jpg, spaghetti_carbonara/1559267.jpg, spaghetti_carbonara/1668631.jpg, spaghetti_carbonara/1739526.jpg, spaghetti_carbonara/1891700.jpg, spaghetti_carbonara/190173.jpg, spaghetti_carbonara/1936669.jpg, spaghetti_carbonara/1940255.jpg, 
spaghetti_carbonara/2228065.jpg, spaghetti_carbonara/2281641.jpg, spaghetti_carbonara/2567706.jpg, spaghetti_carbonara/2774715.jpg, spaghetti_carbonara/2796656.jpg, spaghetti_carbonara/2835081.jpg, spaghetti_carbonara/2967972.jpg, spaghetti_carbonara/2980079.jpg, spaghetti_carbonara/3045854.jpg, spaghetti_carbonara/3377897.jpg, spaghetti_carbonara/3581296.jpg, spaghetti_carbonara/3708340.jpg, spaghetti_carbonara/3908531.jpg, spaghetti_carbonara/560793.jpg, spaghetti_carbonara/733714.jpg, spaghetti_carbonara/755025.jpg] \n","\n"," mean_distance count \n","component_id \n","24810 0.9244 40 \n","18229 0.9250 33 \n","26394 0.9279 29 "],"text/html":["\n","
\n"," \n"," \n"," \n"," \n"," "]},"metadata":{}}],"source":["# visualize outliers\n","fd.vis.outliers_gallery()"]},{"cell_type":"markdown","id":"3a9268dc","metadata":{"id":"3a9268dc"},"source":["# Remove broken images\n","Using fastdup we are able to recover a list of files that are corrupted or that could not be loaded for various reasons. The reason is listed for each image. We will fetch them, and add them to our list of images to remove. Food-101 is meticulously curated, so just for the sake of demonstration we've added one empty image."]},{"cell_type":"code","execution_count":19,"id":"09080622","metadata":{"id":"09080622","outputId":"bab9ce79-8bf1-42c7-ccc5-b383b5c6787d","colab":{"base_uri":"https://localhost:8080/","height":49},"executionInfo":{"status":"ok","timestamp":1677675538250,"user_tz":-120,"elapsed":479,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":["Empty DataFrame\n","Columns: [img_filename, fastdup_id, error_code, is_valid]\n","Index: []"],"text/html":["\n","
\n"," \n"," \n"," \n"," \n"," "]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":[""],"image/png":"iVBORw0KGgoAAAANSUhEUgAAAZgAAAENCAYAAAAykHOlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAAsTAAALEwEAmpwYAAAiWUlEQVR4nO3de5RdZZnn8e8v0RBKCBRJbEYkhAgaEbs1KXpGtBctLISgiIOk8cK0opOEURtaHKYDJhqwbQLTXBzSdlI43aCsNHJT5BIyBCSiyEAlOIgQpIFwEUSCFUKoQCA888e7T7Kzc6rqFNQ+t/p91trrnLP3s996z6bIU+9+L1sRgZmZ2XAb1egKmJlZe3KCMTOzUjjBmJlZKZxgzMysFE4wZmZWCicYMzMrhROMmZmVwgnGzMxK4QRjZmalcIIxqwNJayWdJuleSS9K+t+S/kTSMkkvSFohqTOL/U+S7pC0XtL/k/SXuXJOlPRAds4jkubkjv2lpCclfU3SHyQ9LenE+n9bs8QJxqx+PgkcDrwTOBpYBpwBTCT9v3iypL2AG4C/B/YA/jtwtaSJWRl/AD4GjANOBC6QNC33M/YEdgP2Ar4I/FMlcZnVmxOMWf1cFBHPRMTvgNuB/xsR90TES8CPgPcDJwA3RsSNEfFaRNwM9ABHAUTEDRHxcCQrgf8D/EXuZ7wCnBURr0TEjcBG4F31+4pm2zjBmNXPM7n3m6p83gXYB5iZ3R5bL2k98CHgPwBImiHpTkl/zI4dBUzIlfNcRLya+9yXlWtWd29qdAXMbDtPAD+IiFnFA5J2Aq4G/hq4NiJekfRjQPWtollt3IIxay6XAUdLOkLSaEljs877twNjgJ2AZ4FXJc0APtLIypoNxAnGrIlExBPAMaTO/2dJLZrTgFER8QJwMnAF0At8BvhJg6pqNij5gWNmZlYGt2DMzKwUTjBmZlYKJxgzMyuFE4yZmZXC82AyEyZMiMmTJze6GmZmQ7NqVXqdPr1BP37VuoiYWO2YE0xm8uTJ9PT0NLoaZmZDo2yebYP+/ZL0WH/HfIvMzMxK4QRjZmalcIIxM7NSOMGYmVkp3MlvZtbKmni5L7dgzMysFE4wZmZWCicYM7NWNn16wyZZDsZ9MGZmrWz16kbXoF9OMGYNNHnuDVvfr1340QbWxGz4+RaZmZmVwgnGzMxKUdotMknHAacC7wLeAjwG/AA4NyI2ZzFrgX0Kpz4TEXsWyjoAuAj4ALAe+B5wZkRsycUIOB34b8AE4G7g5Ij41TB/NbMhyd8GMxtJyuyDGQ/cCvxPUlL4c2ABsCfwlVzcUlLyqNicL0RSJ7ACuB84BngHcB6p9TUvFzoXmA+cBqwhJbcVkg6MiN8P03cyM7MalZZgImJJYddPJY0DvizpbyK2Tj99OiLuHKCok4CdgWMjYgNwc1bOAknnRsQGSWNJCebsiFgEIOmXwFpSMptXvWgzsxY3a1aja9CvevfBPAeMGeI5M4DlWXKpuJyUdA7JPh8MjAOuqARExIvAddn5Zmbtqbs7bU2o9AQjabSkDkkfAk4G/jnXegH4oqTNkp6XdJWkYp/MVNItr60i4nGgLztWidkCPFQ494FcjFlTmzz3hq2bWTuoxzyYF4GdsvffJ/WRVFwL3Ak8Cbwb+CZwu6T3RsTzWUwnqQ+nqDc7VonZmO/0z8V0SBpTGViQJ2k2MBtg0qRJQ/xaZmZNoMGPTB5IPRLMwUAHqZP/G8Ai4EsAEXFKLu52SXcAvwJOBC4su2IR0Q10A3R1dTXvkqTWct5oK6R4vidhWr+6utJrE66qXHqCiYjKOgY/l7QOuFTSeRHxcJXY+yQ9CEzL7e4FdqtSdGd2rBKzi6TRhVZMJ9BXrfViZmblqn
cnfyXZ7DtATGRbxRoK/SiS9ia1itbkYkYD+xXK2qH/xszM6qPeCeaD2euj1Q5KOpCUFFbldi8DjpC0a27f8cAmYGX2+Q5gAzAzV1YHcHR2vpmZ1VmZM/lvIk2Q/A1phNcHga8BP4yIhyV9FDgBuB54ipRY5gGPA5fkilpMGn12jaRzgCmkCZvnV4YuR8RLkhYC8yX1sm2i5Si2n8RpZmZ1UmYfzN3A54HJwKvAI6SlXBZnx58A3krqzN+dNEfmJuCM/JyXiOiVdBhpcMB1pBFlF5CSTN5CUkI5nbSKQA9weEQ8M7xfy2xHZQ8t9qrL1orKnMk/n7R0S3/H7wUOq7Gs+4FDB4kJ4NvZZmZmDebnwZiZtbKenkbXoF9OMGZmrawJJ1hW+HkwZmZWCicYM7NWNnt22pqQE4yZWSu7+OK0NSH3wZi1GA9ZtlbhBGP2OnlZfbOB+RaZmZmVwgnGzMxK4QRjZmalcB+MWQvzg8mMadMGj2kQJxgzs1a2atXgMQ3iBGM2BB45ZlY798GYmVkpnGDMzFqZlLYm5ARjZmalKC3BSDpO0h2SnpP0kqQHJc2TNCYXI0lnSHpC0iZJP5P0viplHSDpFkl9kp6SdJak0YWYmsoya2eT596wdTNrtDJbMOOBW4H/CswA/gX4OnB+LmYu6amX5wBHAxuBFZL2rARI6gRWAAEcA5wFfA04s/DzBi3LzMzqp8xHJi8p7PqppHHAlyX9DbATKSmcHRGLACT9ElgLfAWYl513ErAzcGxEbABuzspZIOnciNggaWyNZZmZWZ3Uuw/mOaByi+xgYBxwReVgRLwIXEdq8VTMAJZnyaXiclLSOWSIZZmZWZ2UnmAkjZbUIelDwMnAP0dEAFOBLcBDhVMeyI5VTAXW5AMi4nGgLxdXa1lmZlYn9Zho+SLpdhjA94HTsvedwMaI2FKI7wU6JI2JiM1Z3Poq5fZmx4ZS1nYkzQZmA0yaNGlIX8pGBneWW9NbUuyNaB71SDAHAx3AnwPfABYBX6rDzx1URHQD3QBdXV3R4OqYmQ1dkz4uGeqQYCJidfb255LWAZdKOo/UuthF0uhCy6MT6Mu1OHqB3aoU3Zkdq8TUUpaZmdVJvTv5K8lmX1K/ymhgv0JMsc9lDYV+FEl7k1pFa3IxtZRlZtZeurvT1oTqnWA+mL0+CtwBbABmVg5K6iDNYVmWO2cZcISkXXP7jgc2ASuzz7WWZWbWXubMSVsTKu0WmaSbSBMkf0Ma4fVB0gTJH0bEw1nMQmC+pF5SS+NUUtK7KFfUYtLos2sknQNMARYA51eGLkfESzWWZTZi5Aco+Dkx1ghl9sHcDXwemAy8CjwCnE5KGBULSUngdNLM/x7g8Ih4phIQEb2SDiMNDriONKLsAlKSYShlmZlZ/ZQ5k38+aemWgWIC+Ha2DRR3P3DocJRlZmb14dWUzcysFH6ipdkIUJww6j4Zqwe3YMzMrBRuwZiZtbJo3kVI3IIxM7NSOMGYmVkpfIvMrMArKFtLmT49va5a1dh6VOEEY2bWylavHjymQXyLzMzMSuEWjNkI5HXKrB7cgjEzs1I4wZiZWSmcYMzMrBTugzEza2WzZjW6Bv1ygjEza2VN+rhkKPEWmaSZkn4i6XeSNkpaJenThZjbJEWVbWwhbi9JP5L0gqR1khZlj0Qu/sxZkh6S9FL28w4r6/uZmdnAymzBnAo8CnwVWAccBSyVNCEi8o8x/ilwRuHclytvJL0ZWA5sBj4F7A6cn72ekIv7NOlpmQuAnwMnAtdLOigi7hvG72Vm1jwqM/grM/qbSJkJ5uiIWJf7fKukt5ESTz7B/DEi7hygnOOAdwP7RcSjAJJeAS6XdGZEPJTFLQAujYhvZTErgfcDc8klIjOzttLVlV6bcFXl0m6RFZJLxT3A24ZY1Azg7kpyyfyY1KI5EkDSFOCdwBW5n/8acGV2vpmZ1Vm9hyl/APhtYd9HJPVl23JJf1o4PhVYk98REZuBh7Nj5F
63iwMeAPaQNPGNV93MzIaibqPIsg73TwBfyO1eCVwK/DuwD/B14HZJfxYRa7OYTmB9lSJ7s2PkXotxvbnjz1ap02xgNsCkSZNq/SrWhryCstnwq0uCkTQZWApcGxGXVPZHxDdzYbdLWkFqhfxttpUqIrqBboCurq7mu4FpVgfF5Oq1yWy4lH6LTNIewDLgMeCzA8VGxO+BXwDTcrt7gd2qhHeyrYVSeS3GdRaOm5lZnZSaYLK5KtcDY4CPRURfDadFtlWsYVsfS6XcMcAUtvW5VF63i8s+/zEidrg9ZmZm5SpzouWbSKO49geOjIg/1HDOnsCHgPyj2ZYBB0naJ7fv48BOwE0AEfEIafDAzFxZo7LPy97YNzEza2I9PWlrQmX2wXyXNLnyFGC8pPG5Y/cA7wLOJiWhx4BJwOnAa8CFudirSJ3/10iaT7oNdgGwNDcHBtI8mMskrSXdZvscKbl9Zpi/l5lZ82jCCZYVZSaYj2Sv36lybF/gOUCkJDMeeAG4DfhERDxeCYyIVyQdCSwizXN5GbgcOC1fYET8m6RdgL8D5gO/Id2W8yx+M7MGKC3BRMTkGsKOqrGsJ0lDnAeLuxi4uJYyzczawuzZ6bUJF73082DMzFrZxRenrQk5wZiZWSlqSjCS3lt2RczMrL3U2oL5rqS7JH1JUrVJj2ZmZtupqZM/Iv5C0v6kdcRWSboL+NeIuLnU2pmVxGuPmZWv5j6YbM7JPNIw4EOA/yVpjaRjy6qcmZm1rppaMNkS+icCHwVuJj1MbHX2ALFfAteUV0UzM+vXtGmDxzRIrfNgLgK+B5wREZsqOyPiKUnzSqmZmTVE/vahV1ZuAatWDR7TILUmmI8CmyJiC2xd52tsRPRFxA9Kq52ZmbWsWvtgVgA75z53ZPvMzMyqqjXBjI2IjZUP2fuOcqpkZmY1k9LWhGpNMC9K2tqTJGk6sGmAeDMzG+Fq7YP5W+BKSU+RVkDeEzi+rEqZmVnrq3Wi5d2SppKe4QLwYES8Ul61zMys1Q1luf6DgMnZOdMkERHfL6VWZmbW8mpd7PIHwD+SHmd8ULZ1DXLOTEk/kfQ7SRslrZL06SpxsyQ9JOmlLOawKjF7SfqRpBckrZO0SNIOgwxqKcvMzOqj1hZMF3BARMQQyj4VeBT4KrCO9HCxpZImRMRFAFnCWUx63PHPSasFXC/poMqTKCW9GVgObAY+BewOnJ+9nlD5YbWUZWZm9VNrgrmP1LH/9BDKPjoi1uU+35otLXMqaWUASMng0oj4FoCklcD7gblsSx7HAe8G9ouIR7O4V4DLJZ2ZrZFWa1lmZu1lyZJG16BftSaYCcD92SrKL1d2RsTH+zuhkFwq7gE+CSBpCvBO4JTcOa9JujK/D5gB3F1JLpkfk1o0RwIPDaEsM7P2UnlkchOqNcEsGKaf9wHgt9n7qdnrmkLMA8AekiZGxLNZ3P35gIjYLOnhXBm1lmUjmJfoHzqvS2ZvRK3DlFdK2gfYPyJWZB3so4fyg7IO90+QnikD0Jm9ri+E9uaOP5u9FmMqcZ252FrKKtZpNjAbYNKkSQPW38ysKXV3p9cmbMnUOopsFnAVULnZtxfpNlVNJE0GlgLXRsQlQ6phiSKiOyK6IqJr4sSJja6OmdnQzZmTtiZU61IxXwY+CGyArQ8fe2stJ0raA1gGPAZ8Nneo0rooPoK5s3C8t0pMJa63EDtYWWZmVie1JpiXI2Jz5YOkNwGDDlnObqVdD4wBPhYRfbnDlf6SqYXTpgJ/zPWZrCnGSBoDTMmVUWtZZmZWJ7UmmJWSzgB2lnQ4cCVw3UAnZEnoSmB/4MiI+EP+eEQ8Qurwn5k7Z1T2eVkudBlwUNYHVPFxYCfgpiGWZWZmdVLrKLK5wBeBXwNzgBtJT7gcyHdJkytPAcZLGp87dk9EvEwanXaZpLXAL4DPkRLSZ3KxVwFfB66RNJ90G+wCYGluDgw1lmVmZnVS6yiy14CLs61WH8lev1Pl2L7A2o
j4N0m7AH8HzAd+Q7qVtnXmfUS8IulIYBFwBWkezuXAaYU6DlqWmZnVT00JRtKjVOlziYgp/Z0TEZNrKTsiBk1cEfEkaYjzGy7LzMzqYyhrkVWMJfVt7DH81TGzZlWcqOqJl01iSEtE1ldNnfwR8Vxu+11EXAj4t8vMzPpV6y2yabmPo0gtmqE8S8asIbw8jFnj1Jokzsu9fxVYC/zVsNfGzMyGZvr09LpqVWPrUUWto8g+XHZFzMzsdVi9utE16Fett8hOHeh4RJw/PNUxM7N2MZRRZAcBP8k+Hw3cBTzU7xlmZjai1Zpg3g5Mi4gXACQtAG6ICD8p0szMqqp1LbI/IT1BsmJzts/MzKyqWlsw3wfukvSj7PMngEtLqZGZmbWFWkeRfVvSMuAvsl0nRsQ95VXLzJqdH6fcJGbNanQN+jWUyZIdwIaI+FdJEyXtGxGPllUxMzOrQeWRyU2o1kcmf5O0SvHp2a43A5eVVSkzM2t9tXby/2fSQ75eBIiIp4Bdy6qUmZnVaNWqppzFD7XfItscESEpACS9pcQ6mZlZrbqyxe6bcFXlWlswV0haAuwuaRawAj93xczMBjBogpEk4IekRxdfDbwL+EZEXFTDuftJWiLpXklbJN1WJWatpChsv68Sd4CkWyT1SXpK0lmSRhfrKukMSU9I2iTpZ5LeN1g9zeyNmTz3hq2bWcWgt8iyW2M3RsR7gZuHWP57gKOAO0kDA/qzFMgnrPykTiR1klpN9wPHAO8grfA8CpiXC51LelzyacAa4FRghaQDI2KHpGVmZuWptQ9mtaSDIuLuIZZ/XURcCyDpKmBCP3FPR8SdA5RzErAzcGxEbABuljQOWCDp3IjYIGksKcGcHRGLsp/5S9KjBb7C9onI2pT/gjZrHrX2wfxH4E5JD2e3u34t6d7BToqI195Y9baaASzPkkvF5aSkc0j2+WBgHHBF7ue/CFyXnW9mZnU0YIKRNCl7ewQwBTiUtJLyx7LX4fJFSZslPS/pKkn7FI5PJd3y2ioiHgf6smOVmC3suMLzA7kYMzOrk8Fukf2YtIryY5KujohPllCHa0l9NE8C7wa+Cdwu6b0R8XwW0wmsr3Jub3asErMxIrZUiemQNCYiin07s4HZAJMmTcLMrOX09DS6Bv0aLMEo935KGRWIiFNyH2+XdAfwK+BE4MIyfmbuZ3cD3QBdXV3NN4jczGwwlUcmN6HB+mCin/eliYj7gAeBabndvcBuVcI7s2OVmF2KQ5ezmL5i68XMzMo1WAvmzyRtILVkds7ek32OiBhXUr2C7RPaGgr9KJL2Ji3AuSYXMxrYj5SgKnbovzEzaxuzZ6fXJlz0csAWTESMjohxEbFrRLwpe1/5XEpykXQgKSnkF9dZBhwhKb/+2fHAJmBl9vkOYAMwM1dWB2kwwrIy6mpm1nAXX5y2JjSU5fqHLPsH/qjs417AOEnHZZ9vBD4MnABcDzxFSizzgMeBS3JFLQZOBq6RdA6pP2gBcH5l6HJEvCRpITBfUi/bJlqOYvtJnNZmPPeluRT/e/hZMSNXqQkGeCtwZWFf5fO+wBNZzIXA7sBzwE3AGfk5LxHRK+kwYBFpXst64AJSkslbSEoopwPjgR7g8Ih4Zpi+j5mZ1ajUBBMRa9l+JFo1h9VY1v2keTgDxQTw7WwzM7MGqnUmv5mZ2ZA4wZiZWSnK7oMxM7MyTZs2eEyDOMGYmbWyJn1cMvgWmZmZlcQJxszMSuFbZNZyPLHSLEfZTJBovvV63YIxM7NSOMGYmVkpfIvMzEqVv6XpdclGFrdgzMysFE4wZmZWCicYMzMrhftgzMxa2ZIlja5BvxRNOHa6Ebq6uqKnp6fR1bB+eO5L+3GHf3uQtCoiuqodK/UWmaT9JC2RdK+kLZJuqxIjSWdIekLSJkk/k/S+KnEHSLpFUp+kpySdJWn06ynLzMzKV3YfzHtIj0x+EPhtPzFzgfnAOcDRwEZghaQ9KwGSOoEVQADHAGcBXw
POHGpZZmZtpbs7bU2o7ARzXUTsHREzgd8UD0oaS0oKZ0fEoohYAcwkJZKv5EJPAnYGjo2ImyNiMSm5nCpp3BDLMjNrH3PmpK0JlZpgIuK1QUIOBsYBV+TOeRG4DpiRi5sBLI+IDbl9l5OSziFDLMvMzOqg0cOUpwJbgIcK+x/IjuXj1uQDIuJxoC8XV2tZZmZWB41OMJ3AxojYUtjfC3RIGpOLW1/l/N7s2FDK2krSbEk9knqeffbZ1/sdzMysikYnmIaKiO6I6IqIrokTJza6OmZmbaXREy17gV0kjS60PDqBvojYnIvbrcr5ndmxoZRlLcJzX8xaW6MTzBpgNLAfaShzRbHPZQ2FfhRJewMdubhayzKzJlD8A8ITL9tPo2+R3QFsIA0nBkBSB2kOy7Jc3DLgCEm75vYdD2wCVg6xLDOz9hHRlE+zhJJbMNk/8EdlH/cCxkk6Lvt8Y0T0SVoIzJfUS2ppnEpKfBfliloMnAxcI+kcYAqwADi/MnQ5Il6qsSwzM6uDsm+RvRW4srCv8nlfYC2wkJQETgfGAz3A4RHxTOWEiOiVdBiwiDSvZT1wASnJ5A1alpmZ1UepCSYi1gIaJCaAb2fbQHH3A4cOR1lmZm1j+vT0umpVY+tRRaM7+c3M7I1YvbrRNeiXE4yZNYX8qDKPKGsPjR5FZmZmbcotGGsanlhp1l7cgjEzs1I4wZiZWSl8i8zMmo47/Idg1qxG16BfTjBmZq2sSR+XDL5FZmZmJXELxhrKI8fM3qDKDP7KjP4m4gRjZtbKurrSaxOuqOxbZGZmVgonGDMzK4UTjJmZlcJ9MFZ37tg3Gxka3oKR9HlJUWU7KRcjSWdIekLSJkk/k/S+KmUdIOkWSX2SnpJ0lqTRdf1CZjasJs+9YbvNWkcztWAOBTblPj+Sez8XmA+cxrZHIa+QdGBE/B5AUiewArgfOAZ4B3AeKYnOK7321i//o2A2MjVTgrk7IjYWd0oaS0owZ0fEomzfL0mPW/4K25LHScDOwLERsQG4WdI4YIGkc7N9Zmbtpaen0TXoV8NvkdXgYGAccEVlR0S8CFwHzMjFzQCWFxLJ5aSkc0gd6mlmVn/TpzflJEtorgTzsKRXJT0oaU5u/1RgC/BQIf6B7Fg+bk0+ICIeB/oKcWZmVgfNcIvsaVL/yl3AaOBTwGJJHRFxAdAJbIyILYXzeoEOSWMiYnMWt75K+b3ZsR1Img3MBpg0adIwfBUzszqbPTu9NuGilw1PMBGxHFie27Us63eZJ+k7Jf/sbqAboKurq/nWWTCzHXgp/4KLL06vTZhgmukWWd5VwB7AZFILZJcqw407gb6s9UIWt1uVsjqzY2ZmVkcNb8H0I3Kva0i3zvYDHszFFPtc1lDoa5G0N9BRiLM68NBkM2vWFsxxwDrgMeAOYAMws3JQUgdwNLAsd84y4AhJu+b2HU+aW7Oy7Aqbmdn2Gt6CkXQ1qYP/XlJL5fhsOzkiXgNekrQQmC+pl20TLUcBF+WKWgycDFwj6RxgCrAAON9zYOrDrRYzy2t4giHd9voCsDcg0kz8v46IH+RiFpISyunAeKAHODwinqkERESvpMOARaQ5MuuBC0hJxszM6kzRhA+paYSurq7oaeIZsc3ILRZrJiN2RFllkmXlyZZ1JmlVRHRVO9YMLRhrIU4q1qyKv5sjJuE0KLHUolk7+c3MrMU5wZiZWSl8i8wG5dtiZk1MSq9N2J/uBGNmbclLyjSeb5GZmVkp3IIxs7bn1kxjOMEY4P8BzWz4OcHYDtypb2bDwX0wZmZWCrdg2pxbI2bba7sZ/0uWNLoG/XKCMTNrZZVHJjchJxgzG9E8wKU8TjBtyLfFzEaQ7u702oQtGSeYNuCEYjaCzZmTXp1gzMyal2+XDa+2SzCSDiA9SvkDpKdafg84MyK2NLJew8EtFbP6Gej/Nyef2rRVgpHUCawgPXb5GOAdwHmk+T7zGlg1M2sjbTfUuSRtlWCAk4CdgWMjYg
Nws6RxwAJJ52b7mppbKWatp7//b0d64mm3BDMDWF5IJJcD5wCHANc1pFYFTiJmI8NIv83WbglmKnBrfkdEPC6pLzs27AnGycLMXo/h+rdj7bCUUo52SzCdpI79ot7s2HYkzQYqY/s2SnqwvKq1pAnAukZXoon5+gzM12dgw3J9tPWNBgor0z79HWi3BDMkEdENdDe6Hs1KUk9EdDW6Hs3K12dgvj4DGwnXp91WU+4FdquyvzM7ZmZmddJuCWYNqa9lK0l7Ax3ZMTMzq5N2SzDLgCMk7ZrbdzywCVjZmCq1NN8+HJivz8B8fQbW9tdHEdHoOgybbKLl/cB9pKHJU4DzgQsjwhMtzczqqK0SDGxdKmYR2y8Vs6AdlooxM2slbZdgzMysObRbH4wNgaTPS4oq20m5GEk6Q9ITkjZJ+pmk9zWw2qWRtJ+kJZLulbRF0m1VYmq6HpIOkHSLpD5JT0k6S9LoenyPstR4fdZW+X36fZW4tro+kmZK+omk30naKGmVpE9XiZsl6SFJL2Uxh1WJ2UvSjyS9IGmdpEWSOurzTYbXiJ4HY1sdShoIUfFI7v1cYD5wGmkk3qnACkkHRsQO/3C0uPcARwF3Am/uJ2bQ69HGi67Wcn0AlpJWNK/YnD/YptfnVOBR4KukyZNHAUslTYiIiwCyhLMYWAD8HDgRuF7SQRFxXxbzZmA56Zp9Ctid1I+8O3BC/b7OMIkIbyN0Az4PBLBLP8fHAs8D38jtewvwLPD3ja5/CddjVO79VcBtr+d6AKeT5l2Ny+37H0Bffl+rbYNdn2z/WuAfBymn7a4PMKHKvqXAo7nPDwL/kr+ewK+By3L7Pg1sAfbN7fsr4DVg/0Z/z6FuvkVmAzkYGAdcUdkRES+S1nSb0ahKlSUiXhskpNbr0d+iqzuTFl1tSTVcn1q13fWJiGpLvtwDvA1A0hTgnWz/u/MacCU7/u7cHRGP5vb9mNSiOXJ4a10+JxgDeFjSq5IelDQnt38q6a+phwrxD1CY0DpC1Ho9plKY2BsRj5P+Qh8J1+2LkjZLel7SVZKKa1WNlOvzAeC32fvK9ypO+H4A2EPSxFxc8dpsBh6mBa+N+2BGtqdJ/Ql3AaNJ93wXS+qIiAtIS+xsjB2HePcCHZLGZL/8I0Wt12NIi662mWtJfTRPAu8GvgncLum9EfF8FtP21yfrvP8E8IVsV+V7rS+E9uaOP0ubXRsnmBEsIpaTOhQrlkkaC8yT9J0GVctaWESckvt4u6Q7gF+ROrQvbESd6k3SZFL/y7URcUlja9NYvkVmRVcBewCTSX817VJl+Ggn0DfCWi9Q+/XwoquZSKOjHgSm5Xa37fWRtAdpyarHgM/mDlW+V/F7dxaOt9W1cYKxosi9riHdOtuvELPDfeIRotbr4UVXtxds+72CNr0+2VyV64ExwMcioi93uPK9iv0oU4E/RsSzubjitRlDWvaq5a6NE4wVHUcax/8YcAewAZhZOZj9T3Q06a+0kabW6+FFVzOSDiT9g7kqt7vtro+kN5FGhO0PHBkRf8gfj4hHSB3++d+dUdnn4u/OQYWBER8HdgJuKqf25XEfzAgm6WpSB/+9pL/Mj8+2k7MhlC9JWgjMl9TLtomFo9h+Il1byJLFUdnHvYBxko7LPt8YEX01Xo/FwMnANZIqi64uAM4vDM1tKYNdH+DDpMmA1wNPkRLLPOBx4JJcUe14fb5LujanAOMljc8duyciXiZ9x8skrQV+AXyOlJA+k4u9Cvg66drMJ90uuwBYGhHF0YvNr9ETcbw1bgP+gXR/vI/01+Mq4L8UYkT6hX8yi7kdeH+j617S9ZjMtts5xW3yUK4HcABwaxbzNPAtYHSjv2OZ1wf4U+AW0mioV4DfkxLL29r9+pAmmA74u5PFzQL+HXgZWA0cVqWst5PmvmwEngP+Ceho9Hd8PZsXuzQzs1K4D8bMzErhBGNmZqVwgjEzs1I4wZiZWSmcYMzMrBROMGZmVgonGDMzK4
UTjJmZleL/A7HsM2qXQ/V2AAAAAElFTkSuQmCC\n"},"metadata":{"needs_background":"light"}}],"source":["# visualize bright images\n","fd.vis.stats_gallery(metric='bright')"]},{"cell_type":"markdown","id":"d2f376c8","metadata":{"id":"d2f376c8"},"source":["# Wrap up and collect file list"]},{"cell_type":"code","execution_count":25,"id":"f40c1b53","metadata":{"id":"f40c1b53","executionInfo":{"status":"ok","timestamp":1677675704901,"user_tz":-120,"elapsed":711,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[],"source":["# let's collect all files to remove\n","stats_df = fd.img_stats()\n","outlier_df = fd.outliers()\n","\n","# for outliers, we'll take images whose closest similarity is lower than 0.68\n","outliers_filtered = outlier_df[outlier_df.distance < 0.68].img_filename_outlier.tolist()\n","\n","images_to_remove = dict(\n"," duplicates = list(cluster_images_to_discard), # the duplicates threshold is 0.9, the same ccthreshold we ran fastdup with\n"," outliers_to_discard = outliers_filtered,\n"," invalid_images = fd.invalid_instances(),\n"," blurry_images = stats_df[stats_df['blur'] < 50], # we'll choose blur threshold as 50\n"," dark_images = stats_df[stats_df['mean'] < 13], # darkness threshold as 13\n"," bright_images = stats_df[stats_df['mean'] > 220.5], # and brightness threshold as 220.5\n",")"]},{"cell_type":"code","execution_count":26,"id":"b5e24a96","metadata":{"id":"b5e24a96","outputId":"a63f32ff-47c5-4eac-a3df-9181838b496e","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1677675707307,"user_tz":-120,"elapsed":284,"user":{"displayName":"Tom Shani","userId":"00667426488827942961"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["duplicates: 2153\n","outliers_to_discard: 174\n","invalid_images: 0\n","blurry_images: 30\n","dark_images: 6\n","bright_images: 13\n","Total - 2368 unique images\n"]}],"source":["# we'll gather every image to remove, then count the unique ones\n","images_to_discard 
= []\n","\n","for key, data_slice in images_to_remove.items():\n"," ext = data_slice if isinstance(data_slice, list) else data_slice['img_filename'].tolist()\n"," print(f\"{key}: {len(data_slice)}\")\n"," images_to_discard.extend(ext)\n","print(f\"Total - {len(set(images_to_discard))} unique images\")"]},{"cell_type":"markdown","id":"17ceb0d6","metadata":{"id":"17ceb0d6"},"source":["# Summary\n","Even with conservative thresholds, we found 2,368 unique images (duplicates, outliers, and blurry, dark, or overly bright images) that should be excluded before labeling or training on this dataset."]},{"cell_type":"code","source":[],"metadata":{"id":"Y1shK2sIQ-oL"},"id":"Y1shK2sIQ-oL","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.16"},"vscode":{"interpreter":{"hash":"5b6e8fba36db23bc4d54e0302cd75fdd75c29d9edcbab68d6cfc74e7e4b30305"}},"colab":{"provenance":[]}},"nbformat":4,"nbformat_minor":5}