feat: parse html for url
This commit is contained in:
79
HiddenUrl/Program.cs
Normal file
79
HiddenUrl/Program.cs
Normal file
@@ -0,0 +1,79 @@
|
||||
// See https://aka.ms/new-console-template for more information
|
||||
using System;
|
||||
using System.Net.Http;
|
||||
using System.Threading.Tasks;
|
||||
using HtmlAgilityPack;
|
||||
using System.Linq;
|
||||
|
||||
|
||||
// Fetches the challenge page (from a local cache file when one exists, otherwise over HTTP),
// then prints the string formed by concatenating the "value" attribute of every matching
// <b class="ref"> element, in document order.

const string ChallengeUrl = "https://tns4lpgmziiypnxxzel5ss5nyu0nftol.lambda-url.us-east-1.on.aws/challenge";
const string PageCache = "challenge.html";

// One query for the whole ancestor chain:
//   <section data-id="92*"> ... <article data-class="*45"> ... <div data-tag="*78*"> ... <b class="... ref ...">
// XPath 1.0 has no ends-with(), so the article predicate compares the last two characters
// via substring(). A single expression yields each <b> node at most once, even when matching
// ancestors are nested inside one another, so no character is emitted twice.
const string HiddenCharXPath =
    "//section[starts-with(@data-id, '92')]" +
    "//article[substring(@data-class, string-length(@data-class) - 1) = '45']" +
    "//div[contains(@data-tag, '78')]" +
    "//b[contains(concat(' ', normalize-space(@class), ' '), ' ref ')]";

try
{
    string html;

    if (File.Exists(PageCache))
    {
        Console.WriteLine("Loading HTML from cache...");
        html = await File.ReadAllTextAsync(PageCache);
    }
    else
    {
        Console.WriteLine("Downloading HTML...");
        try
        {
            using var client = new HttpClient();
            html = await client.GetStringAsync(ChallengeUrl);
        }
        // A request timeout surfaces as TaskCanceledException, not HttpRequestException.
        catch (Exception e) when (e is HttpRequestException or TaskCanceledException)
        {
            Console.WriteLine($"HTTP error: {e.Message}");
            return;
        }

        // Caching is best-effort: the page is already in memory, so a failed write
        // must not prevent it from being parsed.
        try
        {
            await File.WriteAllTextAsync(PageCache, html);
            Console.WriteLine("HTML cached to file.");
        }
        catch (Exception e) when (e is IOException or UnauthorizedAccessException)
        {
            Console.WriteLine($"Could not write cache file: {e.Message}");
        }
    }

    var doc = new HtmlDocument();
    doc.LoadHtml(html);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var refNodes = doc.DocumentNode.SelectNodes(HiddenCharXPath);
    if (refNodes is null)
    {
        Console.WriteLine("No matching elements found in page.");
        return;
    }

    // A <b> without a "value" attribute contributes nothing to the output.
    var hiddenUrl = string.Concat(refNodes.Select(b => b.GetAttributeValue("value", string.Empty)));
    Console.WriteLine(hiddenUrl);
}
catch (Exception e)
{
    // Report the actual failure (cache read error, invalid XPath, ...) instead of hiding it.
    Console.WriteLine($"Unable to get hidden URL from page: {e.Message}");
}
|
||||
|
||||
Reference in New Issue
Block a user