using HtmlAgilityPack;
namespace Rider_ConsoleApp;
// ReSharper disable UnusedAutoPropertyAccessor.Global
/// <summary>
/// Reads a transposed, delimiter-separated product sheet (one line per field, one column per
/// product) and downloads every image found on the web pages that the products link to.
/// </summary>
public static class FilteredV1
{
    /// <summary>Browser-like User-Agent sent with every request made by this class.</summary>
    private const string BrowserUserAgent =
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";

    /// <summary>Upper bound on web pages that are scraped at the same time.</summary>
    private const int MaxConcurrentPageDownloads = 10;

    /// <summary>Pause inserted after each image request so the remote host is not hammered.</summary>
    private static readonly TimeSpan DelayBetweenImages = TimeSpan.FromSeconds(1);

    /// <summary>
    /// Single client shared by all requests. Per-request headers (Referer) are set on the
    /// individual <see cref="HttpRequestMessage"/> so concurrent downloads do not interfere.
    /// </summary>
    private static readonly HttpClient SharedHttpClient = CreateHttpClient();

    /// <summary>
    /// Parses the file at <paramref name="filePath"/> and returns one dictionary per product.
    /// </summary>
    /// <param name="filePath">Path of the delimiter-separated file to read.</param>
    /// <param name="separator">Character that separates the cells of a line.</param>
    /// <returns>Field-name to value maps for every product that has a non-empty "Model" field.</returns>
    public static List<Dictionary<string, string>> Start(string filePath, char separator)
    {
        return ParseCsv(filePath, separator);
    }

    /// <summary>
    /// Reads a transposed table: the first cell of each line is a field name (e.g. "Brand",
    /// "Model") and cell <c>j</c> (j &gt;= 1) holds that field's value for product <c>j - 1</c>.
    /// Lines whose first cell is empty are ignored; a repeated field name overwrites the earlier value.
    /// </summary>
    /// <param name="filePath">Path of the file to read.</param>
    /// <param name="separator">Character that separates the cells of a line.</param>
    /// <returns>The products that have a non-empty "Model" value, in column order.</returns>
    private static List<Dictionary<string, string>> ParseCsv(string filePath, char separator)
    {
        var items = new List<Dictionary<string, string>>();

        foreach (var line in File.ReadLines(filePath))
        {
            var cells = line.Split(separator);
            var fieldName = cells[0];
            if (string.IsNullOrEmpty(fieldName))
            {
                continue;
            }

            for (var column = 1; column < cells.Length; column++)
            {
                // Columns are visited in ascending order, so at most one new item is ever missing.
                if (items.Count < column)
                {
                    items.Add(new Dictionary<string, string>());
                }

                items[column - 1][fieldName] = cells[column];
            }
        }

        return items
            .Where(item => item.TryGetValue("Model", out var model) && !string.IsNullOrEmpty(model))
            .ToList();
    }

    /// <summary>
    /// Downloads <paramref name="url"/> to <paramref name="filePath"/>, overwriting an existing file.
    /// The image URL itself is sent as the Referer. Failures are written to the console and never thrown.
    /// </summary>
    /// <param name="url">Absolute http(s) URL of the image.</param>
    /// <param name="filePath">Destination file path.</param>
    public static async Task DownloadImageAsync(string url, string filePath)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var imageUri) || !IsHttp(imageUri))
        {
            Console.WriteLine($"Error downloading image from {url}: not a valid absolute http(s) URL");
            return;
        }

        await DownloadImageCoreAsync(imageUri, filePath, imageUri);
    }

    /// <summary>
    /// Scans every value of every entry in <paramref name="list"/> for text containing "http" and
    /// scrapes the images of each distinct such value into <c>data/{Brand}-{Model}/</c>.
    /// At most <see cref="MaxConcurrentPageDownloads"/> pages are processed concurrently.
    /// </summary>
    /// <param name="list">Product entries, typically the result of <see cref="Start"/>.</param>
    public static async Task HandleImageDownloads(List<Dictionary<string, string>> list)
    {
        var inFlight = new List<Task>();
        var seenUrls = new HashSet<string>();

        foreach (var item in list)
        {
            // Both names are needed to build the target folder; without them the links are skipped.
            if (!item.TryGetValue("Model", out var model) || !item.TryGetValue("Brand", out var brand))
            {
                Console.WriteLine("Skipping entry without both \"Brand\" and \"Model\" fields.");
                continue;
            }

            foreach (var value in item.Values)
            {
                if (string.IsNullOrEmpty(value) || !value.Contains("http", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // The same page may be referenced by several products; scrape it only once.
                if (!seenUrls.Add(value))
                {
                    continue;
                }

                inFlight.Add(DownloadAllImagesFromWebpageAsync(value, $"data/{brand}-{model}/"));

                if (inFlight.Count >= MaxConcurrentPageDownloads)
                {
                    // Throttle: wait until one slot frees up before starting further pages.
                    var finished = await Task.WhenAny(inFlight);
                    inFlight.Remove(finished);
                }
            }
        }

        // Make sure all remaining pages are done before returning.
        await Task.WhenAll(inFlight);
    }

    /// <summary>
    /// Fetches <paramref name="webpageUrl"/>, finds every <c>img</c> element with a <c>src</c>
    /// attribute and saves each distinct http(s) image into <paramref name="downloadDirectory"/>.
    /// The page URL is sent as the Referer of every image request. Failures are written to the
    /// console and never thrown.
    /// </summary>
    /// <param name="webpageUrl">Absolute http(s) URL of the page to scan.</param>
    /// <param name="downloadDirectory">Directory that receives the images; created when missing.</param>
    public static async Task DownloadAllImagesFromWebpageAsync(string webpageUrl, string downloadDirectory)
    {
        if (!Uri.TryCreate(webpageUrl, UriKind.Absolute, out var pageUri) || !IsHttp(pageUri))
        {
            Console.WriteLine($"Error accessing webpage {webpageUrl}: not a valid absolute http(s) URL");
            return;
        }

        try
        {
            using var request = new HttpRequestMessage(HttpMethod.Get, pageUri);
            request.Headers.Referrer = pageUri;
            using var response = await SharedHttpClient.SendAsync(request);
            response.EnsureSuccessStatusCode(); // Throws HttpRequestException when the status is not 2xx.

            var htmlContent = await response.Content.ReadAsStringAsync();
            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(htmlContent);

            // Select all <img> tags that carry a src attribute; the result is null when none match.
            var imageNodes = htmlDocument.DocumentNode.SelectNodes("//img[@src]");
            if (imageNodes == null)
            {
                Console.WriteLine($"No images found on {webpageUrl}");
                return;
            }

            Console.WriteLine($"Found {imageNodes.Count} images on {webpageUrl}");
            Directory.CreateDirectory(downloadDirectory);

            var requestedImages = new HashSet<string>();
            foreach (var imgNode in imageNodes)
            {
                var imageUrl = imgNode.GetAttributeValue("src", string.Empty);
                if (string.IsNullOrEmpty(imageUrl))
                {
                    continue;
                }

                // Resolves relative and protocol-relative sources against the page and passes
                // absolute ones through; inline (data:) and other non-http sources are skipped.
                if (!Uri.TryCreate(pageUri, imageUrl, out var absoluteUrl) || !IsHttp(absoluteUrl))
                {
                    Console.WriteLine($"Skipping image with unsupported source on {webpageUrl}");
                    continue;
                }

                // An image referenced several times on the page is fetched only once.
                if (!requestedImages.Add(absoluteUrl.AbsoluteUri))
                {
                    continue;
                }

                var fileName = Path.GetFileName(absoluteUrl.LocalPath);
                if (string.IsNullOrEmpty(fileName))
                {
                    // URL path ends in '/': there is no name to reuse, so generate one.
                    fileName = $"image-{requestedImages.Count}";
                }

                var filePath = Path.Combine(downloadDirectory, fileName);
                Console.WriteLine($"Downloading image from: {absoluteUrl}");
                await DownloadImageCoreAsync(absoluteUrl, filePath, pageUri);
                await Task.Delay(DelayBetweenImages); // Be polite
            }
        }
        catch (HttpRequestException ex)
        {
            Console.WriteLine($"Error accessing webpage {webpageUrl}: {ex.Message}");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"An error occurred: {ex.Message}");
        }
    }

    /// <summary>Creates the shared client with the browser-like User-Agent preset.</summary>
    private static HttpClient CreateHttpClient()
    {
        var client = new HttpClient();
        client.DefaultRequestHeaders.UserAgent.ParseAdd(BrowserUserAgent);
        return client;
    }

    /// <summary>Returns true when <paramref name="uri"/> uses the http or https scheme.</summary>
    private static bool IsHttp(Uri uri)
    {
        return uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps;
    }

    /// <summary>
    /// Requests <paramref name="imageUri"/> with the given Referer and writes the body to
    /// <paramref name="filePath"/>. Every failure is reported on the console instead of thrown.
    /// </summary>
    private static async Task DownloadImageCoreAsync(Uri imageUri, string filePath, Uri referrer)
    {
        try
        {
            using var request = new HttpRequestMessage(HttpMethod.Get, imageUri);
            request.Headers.Referrer = referrer;
            using var response = await SharedHttpClient.SendAsync(request);

            if (!response.IsSuccessStatusCode)
            {
                Console.WriteLine($"Failed to download image from {imageUri}. Status code: {response.StatusCode}");
                return;
            }

            using (var fileStream = new FileStream(filePath, FileMode.Create, FileAccess.Write, FileShare.None))
            {
                await response.Content.CopyToAsync(fileStream);
            }

            Console.WriteLine($"Image saved: {filePath}");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error downloading image from {imageUri}: {ex.Message}");
        }
    }
}