Files
administration/Controllers/HelloFresh/HelloFreshScraperService.cs
2025-09-03 20:17:50 +02:00

139 lines
5.1 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using AngleSharp;
using AngleSharp.Dom;
using Newtonsoft.Json.Linq;
namespace HelloFreshScraper.Services
{
/// <summary>
/// Plain data holder for one scraped recipe. All members are mutable strings;
/// HelloFreshScraperService fills them from the listing JSON and from the
/// recipe detail page.
/// </summary>
public class Recipe
{
/// <summary>Recipe identifier, read from the "id" field of the listing JSON.</summary>
public string Id { get; set; }
/// <summary>Recipe name, read from the "name" field of the listing JSON.</summary>
public string Name { get; set; }
/// <summary>Value of the "image" field of the listing JSON (may be null when absent).</summary>
public string Image { get; set; }
/// <summary>Value of the "pdf" field of the listing JSON.</summary>
public string Pdf { get; set; }
/// <summary>
/// Preparation time as text.
/// NOTE(review): the original example here was "30-35 min", but the scraper in
/// this file only assigns the digits captured before the word "minutes"
/// (e.g. "35") - confirm which format consumers expect.
/// </summary>
public string PrepTime { get; set; }
/// <summary>Difficulty label taken from the detail page, e.g. "Intermédiaire".</summary>
public string Difficulty { get; set; }
/// <summary>Short description, read from the "headline" field, e.g. "Accompagné de...".</summary>
public string Description { get; set; }
/// <summary>Value of the "label" field of the listing JSON (may be null when absent).</summary>
public string Label { get; set; }
}
/// <summary>
/// Scrapes recipe listings from hfresh.info and enriches every recipe with the
/// preparation time and difficulty found on the matching hellofresh.fr page.
/// </summary>
public class HelloFreshScraperService
{
    // Built once and reused for every detail page instead of being looked up per call.
    // Captures the number in front of the first "minute(s)" occurrence in the page.
    private static readonly Regex PrepTimeRegex = new Regex(
        @"(\d{1,3})\s*minutes?",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Captures the text of the <span> carrying the "recipe-detail.level-number" translation id.
    private static readonly Regex DifficultyRegex = new Regex(
        @"<span[^>]*data-translation-id=[""']recipe-detail\.level-number[^>]*>([^<]+)</span>",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    private readonly HttpClient _httpClient;
    private readonly IBrowsingContext _context;

    /// <summary>
    /// Creates the service around an existing <see cref="HttpClient"/> and prepares
    /// the AngleSharp browsing context used to parse listing pages.
    /// </summary>
    /// <param name="httpClient">Client used for every HTTP request made by this service.</param>
    public HelloFreshScraperService(HttpClient httpClient)
    {
        _httpClient = httpClient;
        var config = Configuration.Default.WithDefaultLoader();
        _context = BrowsingContext.New(config);
    }

    /// <summary>
    /// Downloads <paramref name="pagesToLoad"/> consecutive listing pages starting at
    /// <paramref name="startPage"/>, collects the recipes they contain (skipping
    /// duplicate ids and duplicate names), then fetches all recipe detail pages
    /// concurrently to fill in <c>PrepTime</c> and <c>Difficulty</c>.
    /// </summary>
    /// <param name="locale">Path segment inserted into the listing URL, e.g. "fr-fr".</param>
    /// <param name="startPage">First listing page number to request.</param>
    /// <param name="pagesToLoad">Number of listing pages to request; zero or less loads nothing.</param>
    /// <returns>The collected recipes. Detail fields stay null when a detail page could not be read.</returns>
    /// <remarks>
    /// Failures while loading or parsing a listing page are not caught and propagate to
    /// the caller. Failures on a single detail page are logged to the console and ignored.
    /// </remarks>
    public async Task<List<Recipe>> GetRecipesAsync(string locale = "fr-fr", int startPage = 1, int pagesToLoad = 2)
    {
        var recipesById = new Dictionary<string, Recipe>();
        // Tracks the names already accepted so the duplicate-name check is a set
        // lookup instead of a scan over every stored recipe.
        var seenNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        for (int page = startPage; page < startPage + pagesToLoad; page++)
        {
            var url = $"https://hfresh.info/{locale}?page={page}";
            var html = await _httpClient.GetStringAsync(url);

            // The listing data is embedded as JSON in the data-page attribute of #app.
            string rawData;
            using (var document = await _context.OpenAsync(req => req.Content(html)))
            {
                rawData = document.QuerySelector("#app")?.GetAttribute("data-page");
            }

            if (string.IsNullOrWhiteSpace(rawData))
            {
                continue;
            }

            var recipeArray = JObject.Parse(rawData).SelectToken("props.recipes.data") as JArray;
            if (recipeArray == null)
            {
                continue;
            }

            foreach (var item in recipeArray)
            {
                var id = item["id"]?.ToString();
                var name = item["name"]?.ToString();
                var pdf = item["pdf"]?.ToString();

                // A recipe without id, name or PDF link is not usable downstream.
                if (string.IsNullOrEmpty(id) || string.IsNullOrEmpty(name) || string.IsNullOrEmpty(pdf))
                {
                    continue;
                }

                // Avoid duplicates: first by id, then by case-insensitive trimmed name.
                // The name is only recorded once the id check has passed, so the set
                // always mirrors the names of the recipes actually stored.
                if (recipesById.ContainsKey(id) || !seenNames.Add(name.Trim()))
                {
                    continue;
                }

                recipesById[id] = new Recipe
                {
                    Id = id,
                    Name = name,
                    Image = item["image"]?.ToString(),
                    Pdf = pdf,
                    Description = item["headline"]?.ToString(),
                    Label = item["label"]?.ToString()
                };
            }
        }

        // Scrape all detail pages concurrently and wait until every one has finished.
        await Task.WhenAll(recipesById.Values.Select(LoadRecipeDetailsAsync));

        return recipesById.Values.ToList();
    }

    /// <summary>
    /// Fetches the hellofresh.fr page of one recipe and fills in its preparation
    /// time and difficulty when they can be found. Any error is logged to the
    /// console and swallowed so that one bad page does not fail the whole batch.
    /// </summary>
    /// <param name="recipe">Recipe to enrich; its Name and Id are used to build the URL.</param>
    private async Task LoadRecipeDetailsAsync(Recipe recipe)
    {
        var slug = GenerateSlug(recipe.Name);
        var url = $"https://www.hellofresh.fr/recipes/{slug}-{recipe.Id}";
        try
        {
            var html = await _httpClient.GetStringAsync(url);

            // Preparation time: only the digits before "minutes" are kept (e.g. "35").
            var prepMatch = PrepTimeRegex.Match(html);
            if (prepMatch.Success)
            {
                recipe.PrepTime = prepMatch.Groups[1].Value;
            }

            var difficultyMatch = DifficultyRegex.Match(html);
            if (difficultyMatch.Success)
            {
                recipe.Difficulty = difficultyMatch.Groups[1].Value.Trim();
            }
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: the recipe is still returned without details.
            Console.WriteLine($"❌ Erreur scraping {url} : {ex.Message}");
        }
    }

    /// <summary>
    /// Turns a recipe name into the lowercase, hyphen-separated form used in the
    /// recipe URL: common French accents are folded to ASCII, "&amp;" becomes "et",
    /// apostrophes and spaces become hyphens, and listed punctuation is removed.
    /// </summary>
    /// <param name="name">Recipe name; must not be null.</param>
    /// <returns>The slug, with repeated hyphens collapsed and outer hyphens trimmed.</returns>
    private static string GenerateSlug(string name)
    {
        // Invariant lowercasing keeps the slug independent of the host culture.
        var slug = name.ToLowerInvariant()
            .Replace("é", "e").Replace("è", "e").Replace("ê", "e")
            .Replace("à", "a").Replace("â", "a").Replace("ù", "u")
            .Replace("î", "i").Replace("ô", "o").Replace("ç", "c")
            .Replace("œ", "oe").Replace("&", "et")
            // The first literal here was empty, which makes String.Replace throw
            // ArgumentException on every call. NOTE(review): presumably it was the
            // typographic apostrophe (U+2019) - confirm against the intended source.
            .Replace("\u2019", "-").Replace("'", "-")
            .Replace("\"", "").Replace(",", "").Replace(":", "")
            .Replace("!", "").Replace("?", "").Replace("(", "").Replace(")", "")
            // NOTE(review): the original replaced a space with a space (a no-op);
            // presumably the first was a non-breaking space - confirm.
            .Replace("\u00A0", " ").Replace(" ", "-");

        while (slug.Contains("--"))
        {
            slug = slug.Replace("--", "-");
        }

        return slug.Trim('-');
    }
}
}