{ "cells": [ { "cell_type": "markdown", "id": "4beb231b", "metadata": {}, "source": [ "# Find duplicate granules in OPERA CSLC-S1 dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "a2b07950", "metadata": {}, "outputs": [], "source": [ "import leafmap\n", "import pandas as pd\n", "import geopandas as gpd\n", "from datetime import datetime\n", "import re" ] }, { "cell_type": "markdown", "id": "89393dda", "metadata": {}, "source": [ "To and access the data, you will need to create an Earthdata login. You can register for an account at [urs.earthdata.nasa.gov](https://urs.earthdata.nasa.gov)." ] }, { "cell_type": "code", "execution_count": null, "id": "097cc873", "metadata": {}, "outputs": [], "source": [ "leafmap.nasa_data_login()" ] }, { "cell_type": "markdown", "id": "c54840ce", "metadata": {}, "source": [ "## View available OPERA product metadate\n", "TSV of NASA Earthdata products is available in the [NASA-Earth-Data](https://github.com/opengeos/NASA-Earth-Data) repo. We filter to just OPERA products." ] }, { "cell_type": "code", "execution_count": null, "id": "a5760a6e", "metadata": {}, "outputs": [], "source": [ "url = 'https://github.com/opengeos/NASA-Earth-Data/raw/main/nasa_earth_data.tsv'\n", "earth_data_df = pd.read_csv(url, sep='\\t')\n", "opera_df = earth_data_df[earth_data_df['ShortName'].str.contains('OPERA', case=False)]\n", "opera_df" ] }, { "cell_type": "markdown", "id": "2116d9b8", "metadata": {}, "source": [ "## Load all CSLC-S1 graunules into a geodataframe\n", "For reference, as of Jan. 2024 there are ~225,000 CSLC-S1 granules, and it takes about 6 minutes to load it into the geodataframe." ] }, { "cell_type": "code", "execution_count": null, "id": "186bf525", "metadata": {}, "outputs": [], "source": [ "results, gdf = leafmap.nasa_data_search(\n", " short_name='OPERA_L2_CSLC-S1_V1',\n", " cloud_hosted=True,\n", " bounding_box= (-180.0, -90.0, 180, 90.0),\n", " temporal=(\"2014-06-15\", str(datetime.now().date())),\n", " count=-1, # use -1 to return all datasets\n", " return_gdf=True,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "0d439f64", "metadata": {}, "outputs": [], "source": [ "gdf.tail()" ] }, { "cell_type": "markdown", "id": "f716122c", "metadata": {}, "source": [ "### Make a list of 'native-id' from the files" ] }, { "cell_type": "code", "execution_count": null, "id": "e2c4a73b", "metadata": {}, "outputs": [], "source": [ "identifier_list = gdf['native-id'].tolist()\n", "print('Total granules:', len(identifier_list))\n", "print(identifier_list[0:2])" ] }, { "cell_type": "markdown", "id": "f9d43540", "metadata": {}, "source": [ "## Find duplicates" ] }, { "cell_type": "markdown", "id": "1cc35f0a", "metadata": {}, "source": [ "### Isolate the part of the file name that would be the same for duplicate granules" ] }, { "cell_type": "code", "execution_count": null, "id": "b98a0edd", "metadata": {}, "outputs": [], "source": [ "### Access the parts that may indicate true duplicates from the identifier name\n", "print(identifier_list[0][0:-29]) # burst ID" ] }, { "cell_type": "markdown", "id": "5265ed6c", "metadata": {}, "source": [ "### Create a set of duplicates and unique identifiers" ] }, { "cell_type": "code", "execution_count": null, "id": "bf1222e4", "metadata": {}, "outputs": [], "source": [ "duplicate_identifiers = set()\n", "unique_identifiers = set()\n", "\n", "for identifier in identifier_list:\n", " potential_duplicate_portion = identifier[0:-29]\n", "\n", " # Check if the identifier is already in the set\n", " if potential_duplicate_portion in unique_identifiers:\n", " duplicate_identifiers.add(potential_duplicate_portion)\n", " else:\n", " # Add the identifier to the set if it's not a duplicate\n", " unique_identifiers.add(potential_duplicate_portion)\n", "\n", "# If you need the result as a list, you can convert the sets back to lists\n", "duplicate_identifiers_list = list(duplicate_identifiers)\n", "unique_identifiers_list = list(unique_identifiers)" ] }, { "cell_type": "code", "execution_count": null, "id": "bc3b50d0", "metadata": {}, "outputs": [], "source": [ "print(f'Total CSLC-S1 granules as of {datetime.now().strftime(\"%d-%m-%Y\")}:', len(identifier_list))\n", "print('Granules with more than one version:',len(duplicate_identifiers))" ] }, { "cell_type": "markdown", "id": "6962c147", "metadata": {}, "source": [ "### Collect duplicates for each granulate where duplicates exist" ] }, { "cell_type": "code", "execution_count": null, "id": "faae27c6", "metadata": {}, "outputs": [], "source": [ "# Create a dictionary to store the potentially duplicated portion as the key and a list of entire elements as the value\n", "granules_dictionary = {}\n", "\n", "# Create a list to store pairs of potentially duplicated elements\n", "duplicate_pairs = []\n", "\n", "# Iterate over the elements in the list\n", "for granule in identifier_list:\n", " # Extract the potentially duplicated portion\n", " potential_duplicate_portion = granule[0:-29]\n", " \n", " # If the potential duplicate portion is not in the dictionary, add it with the entire element\n", " if potential_duplicate_portion not in granules_dictionary:\n", " granules_dictionary[potential_duplicate_portion] = [granule]\n", " else:\n", " # If the potential duplicate portion is already in the dictionary, add the entire element to the list\n", " granules_dictionary[potential_duplicate_portion].append(granule)\n", "\n", "# Create pairs from the dictionary values\n", "for granules in granules_dictionary.values():\n", " if len(granules) > 1:\n", " duplicate_pairs.append(granules)" ] }, { "cell_type": "markdown", "id": "2f1d27fd", "metadata": {}, "source": [ "### Print the number of duplicates for each granule" ] }, { "cell_type": "code", "execution_count": null, "id": "a4ccb4d7", "metadata": {}, "outputs": [], "source": [ "print(f'Total CSLC-S1 granules as of {datetime.now().strftime(\"%d-%m-%Y\")}:', len(identifier_list))" ] }, { "cell_type": "code", "execution_count": null, "id": "66fa2c23", "metadata": {}, "outputs": [], "source": [ "duplicates = []\n", "for pair in duplicate_pairs:\n", " for granule in pair:\n", " duplicates.append(granule)\n", "print(f'Total CSLC-S1 duplicate tiles as of {datetime.now().strftime(\"%d-%m-%Y\")}: {len(duplicates)} granules')" ] }, { "cell_type": "code", "execution_count": null, "id": "fa1f4ff6", "metadata": {}, "outputs": [], "source": [ "one_duplicate = []\n", "for pair in duplicate_pairs:\n", " if len(pair) > 1 and len(pair) < 3:\n", " one_duplicate.append(pair)\n", "print(f'Total CSLC-S1 duplicate tiles with 1 duplicate as of {datetime.now().strftime(\"%d-%m-%Y\")}: {len(one_duplicate)} totaling {len(one_duplicate)*2} granules.')" ] }, { "cell_type": "code", "execution_count": null, "id": "6b3fc8db", "metadata": {}, "outputs": [], "source": [ "two_duplicates = []\n", "for pair in duplicate_pairs:\n", " if len(pair) > 2:\n", " two_duplicates.append(pair)\n", "print(f'Total CSLC-S1 duplicate tiles with 2 duplicate as of {datetime.now().strftime(\"%d-%m-%Y\")}: {len(two_duplicates)} totaling {len(two_duplicates)*3} granules.')" ] }, { "cell_type": "code", "execution_count": null, "id": "d01b9ac0", "metadata": {}, "outputs": [], "source": [ "three_duplicates = []\n", "for pair in duplicate_pairs:\n", " if len(pair)>3:\n", " three_duplicates.append(pair)\n", "print(f'Total CSLC-S1 duplicate tiles with 3 duplicate as of {datetime.now().strftime(\"%d-%m-%Y\")}: {len(three_duplicates)} totaling {len(three_duplicates)*4} granules.')" ] }, { "cell_type": "code", "execution_count": null, "id": "78ba1e26", "metadata": {}, "outputs": [], "source": [ "### add the url to the duplicate names\n", "duplicate_urls = []\n", "for pair in duplicate_pairs:\n", " pair_urls = []\n", " for granule in pair:\n", " pair_urls.append('https://datapool.asf.alaska.edu/CSLC/OPERA-S1/'+str(granule)+'.h5')\n", " duplicate_urls.append(pair_urls)\n", " " ] }, { "cell_type": "markdown", "id": "68364d8e", "metadata": {}, "source": [ "### Add the url, burst IDs, and dates to the duplicate names" ] }, { "cell_type": "code", "execution_count": null, "id": "40fab8c3", "metadata": {}, "outputs": [], "source": [ "### burst ids\n", "burst_ids = []\n", "dates = []\n", "for pair in duplicate_pairs:\n", " burst_ids.append(pair[0][17:32])\n", " dates.append(pair[0][33:41])" ] }, { "cell_type": "code", "execution_count": null, "id": "cde71dca", "metadata": {}, "outputs": [], "source": [ "duplicates_df = pd.DataFrame({\n", " 'burst_id': burst_ids,\n", " 'date': dates,\n", " 'duplicates': duplicate_urls\n", "})" ] }, { "cell_type": "markdown", "id": "b1c1c659", "metadata": {}, "source": [ "### Make dataframe of all duplicates" ] }, { "cell_type": "code", "execution_count": null, "id": "a3eb392a", "metadata": {}, "outputs": [], "source": [ "#df_final = pd.DataFrame(duplicates_df['duplicates'].tolist(), index=duplicates_df[['burst_id', 'date']]).reset_index()\n", "df_final = pd.concat([duplicates_df[['burst_id', 'date']], duplicates_df['duplicates'].apply(lambda x: pd.Series(x))], axis=1)\n", "\n", "# Rename the columns\n", "df_final.columns = ['burst_id', 'date', 'duplicate_1', 'duplicate_2', 'duplicate_3']\n", "\n", "# Sort by burst_id \n", "sorted_df = df_final.sort_values(by='burst_id')" ] }, { "cell_type": "code", "execution_count": null, "id": "55f525e9", "metadata": {}, "outputs": [], "source": [ "sorted_df.head()" ] }, { "cell_type": "markdown", "id": "1e45a5b9", "metadata": {}, "source": [ "### Add columns of interest from the original geodataframe for each duplicate and format it nicely (probably could be improved, but should work)" ] }, { "cell_type": "code", "execution_count": null, "id": "06b26c6b", "metadata": {}, "outputs": [], "source": [ "df2 = pd.DataFrame(gdf)\n", "df2.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "2cb5f793", "metadata": {}, "outputs": [], "source": [ "# Function to extract acquisition and processing times as datetime objects\n", "def extract_portion(url):\n", " if pd.notna(url):\n", " match = re.search(r'([^/]+)\\.h5', url)\n", " if match:\n", " info_string = match.group(1)\n", " return info_string\n", " return None\n", "\n", "# Apply the function to extract the portion and create a new column\n", "sorted_df['extracted_portion_duplicate_1'] = sorted_df['duplicate_1'].apply(extract_portion)\n", "sorted_df['extracted_portion_duplicate_2'] = sorted_df['duplicate_2'].apply(extract_portion)\n", "sorted_df['extracted_portion_duplicate_3'] = sorted_df['duplicate_3'].apply(extract_portion)\n", "\n", "merged_df1 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_1', right_on='native-id', how='inner')\n", "merged_df2 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_2', right_on='native-id', how='inner')\n", "merged_df3 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_3', right_on='native-id', how='inner')\n", "\n", "merged_df1['revision-date-1'] = merged_df1['revision-date']\n", "merged_df2['revision-date-2'] = merged_df2['revision-date']\n", "merged_df3['revision-date-3'] = merged_df3['revision-date']\n", "\n", "merged_df1['ProductionDateTime-1'] = merged_df1['ProductionDateTime']\n", "merged_df2['ProductionDateTime-2'] = merged_df2['ProductionDateTime']\n", "merged_df3['ProductionDateTime-3'] = merged_df3['ProductionDateTime']\n", "\n", "merged_df = pd.merge(merged_df1, merged_df2, on='duplicate_1', how='left', suffixes=('_df2', '_df1'))\n", "merged_df = pd.merge(merged_df, merged_df3, on='duplicate_1', how='left', suffixes=('_merged', '_df3'))\n", "\n", "merged_df['burst_id'] = merged_df['burst_id_df2']\n", "merged_df['duplicate_2'] = merged_df['duplicate_2_df2']\n", "merged_df['duplicate_3'] = merged_df['duplicate_3_df2']\n", "\n", "suffixes_to_remove = ['_df1', '_df2', '_df3', '_df4','merged']\n", "\n", "# Iterate over the suffixes and drop columns\n", "for suffix in suffixes_to_remove:\n", " columns_to_drop = [col for col in merged_df.columns if (col.endswith(suffix))]\n", " merged_df = merged_df.drop(columns=columns_to_drop)\n", "\n", "\n", "merged_df['extracted_portion_duplicate_1'] = merged_df['duplicate_1'].apply(extract_portion)\n", "#merged_df['BeginningDateTime'] = merged_df['BeginningDateTime_y']\n", "\n", "common_column = 'extracted_portion_duplicate_1'\n", "column_to_include = 'BeginningDateTime'\n", "\n", "# Merge the DataFrames based on the common column\n", "merged_df = pd.merge(merged_df, df2[['native-id', column_to_include]], left_on=common_column, right_on='native-id', how='left')\n", "\n", "# Drop the duplicate columns and rename the result column\n", "merged_df = merged_df.drop(columns=['extracted_portion_duplicate_1']).rename(columns={column_to_include: 'BeginningDateTime'})\n", "\n", "merged_df['BeginningDateTime'] = merged_df['BeginningDateTime_y']\n", "\n", "columns_to_drop = ['date','extracted_portion_duplicate_2', 'extracted_portion_duplicate_3',\n", " 'size', 'concept-type', 'concept-id', 'revision-id', 'native-id_x',\n", " 'provider-id', 'format', 'revision-date', 'BeginningDateTime_x',\n", " 'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',\n", " 'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',\n", " 'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',\n", " 'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',\n", " 'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',\n", " 'geometry', 'BeginningDateTime_y','native-id_y']\n", "\n", "merged_df = merged_df.drop(columns=columns_to_drop)\n", "\n", "# Specify the desired column order\n", "desired_order = ['burst_id', 'duplicate_1', 'duplicate_2', 'duplicate_3','BeginningDateTime','revision-date-1','revision-date-2','revision-date-3','ProductionDateTime-1','ProductionDateTime-2','ProductionDateTime-3']\n", "\n", "# Create a new DataFrame with the specified column order\n", "merged_df = merged_df[desired_order]\n", "merged_df.head()" ] }, { "cell_type": "markdown", "id": "0a2e2dd8", "metadata": {}, "source": [ "## Output dataframe as csv" ] }, { "cell_type": "code", "execution_count": null, "id": "086080be", "metadata": {}, "outputs": [], "source": [ "# Output the DataFrame to a CSV file\n", "csv_file_path = f'CSLC-S1_duplicates_{datetime.now().strftime(\"%d-%m-%Y\")}.csv'\n", "merged_df.to_csv(csv_file_path, index=False)" ] }, { "cell_type": "markdown", "id": "7f5a10f4", "metadata": {}, "source": [ "## Output duplicates as geojson" ] }, { "cell_type": "code", "execution_count": null, "id": "8e2c1ecf", "metadata": {}, "outputs": [], "source": [ "# make geodataframe\n", "duplicates_gdf = gdf[gdf['native-id'].isin(duplicates)]\n", "\n", "columns_to_drop = ['size', 'concept-type', 'concept-id', 'revision-id', 'native-id',\n", " 'provider-id', 'format', 'revision-date', 'BeginningDateTime',\n", " 'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',\n", " 'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',\n", " 'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',\n", " 'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',\n", " 'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',]\n", "\n", "duplicates_gdf = duplicates_gdf.drop(columns = columns_to_drop)\n", "duplicates_gdf.to_file(f'CSLC-S1_duplicates_{datetime.now().strftime(\"%d-%m-%Y\")}.geojson', driver='GeoJSON')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }