From 810c8e7c63bd023a2a08f3fa4828d32ec6e385f4 Mon Sep 17 00:00:00 2001 From: angelsburger90 <71116188+angelsburger90@users.noreply.github.com> Date: Tue, 23 Jan 2024 23:11:16 +0800 Subject: [PATCH] Updates PCSO Scraping procedure (#25) --- Includes/Classes/Scraping/LottoPCSOScraper.cs | 41 +++++++++-- LottoDataManager.csproj | 71 +++++++------------ packages.config | 33 --------- 3 files changed, 60 insertions(+), 85 deletions(-) delete mode 100644 packages.config diff --git a/Includes/Classes/Scraping/LottoPCSOScraper.cs b/Includes/Classes/Scraping/LottoPCSOScraper.cs index ca6be70..8ade669 100644 --- a/Includes/Classes/Scraping/LottoPCSOScraper.cs +++ b/Includes/Classes/Scraping/LottoPCSOScraper.cs @@ -100,13 +100,40 @@ internal async Task GetWebsiteDOMAsync(Dictionary { var encodedContent = new FormUrlEncodedContent(parameters); CancellationTokenSource cancellationToken = new CancellationTokenSource(); - HttpResponseMessage request = await httpClient.PostAsync(webUrlToScrape, encodedContent); - cancellationToken.Token.ThrowIfCancellationRequested(); - Stream response = await request.Content.ReadAsStreamAsync(); + cancellationToken.CancelAfter(TimeSpan.FromMilliseconds(Timeout.Infinite)); + + HttpResponseMessage response = null; + httpClient.DefaultRequestHeaders.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"); + httpClient.DefaultRequestHeaders.Add("Accept", "text/html"); + httpClient.DefaultRequestHeaders.Add("Accept-Language", "en-US,en;q=0.5"); + httpClient.DefaultRequestHeaders.Add("Accept-Encoding", "deflate"); + httpClient.DefaultRequestHeaders.Add("Connection", "keep-alive"); + httpClient.DefaultRequestHeaders.Add("Upgrade-Insecure-Requests", "1"); + httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Dest", "document"); + httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Mode", "navigate"); + httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Site", "none"); + httpClient.DefaultRequestHeaders.Add("Sec-Fetch-User", "?1"); + httpClient.DefaultRequestHeaders.Add("Pragma", "no-cache"); + httpClient.DefaultRequestHeaders.Add("Cache-Control", "no-cache"); + httpClient.DefaultRequestHeaders.Add("TE", "trailers"); + + if (parameters.Count <=0) + { + response = await httpClient.GetAsync(webUrlToScrape); + } + else + { + response = await httpClient.PostAsync(webUrlToScrape, encodedContent); + } + response.EnsureSuccessStatusCode(); cancellationToken.Token.ThrowIfCancellationRequested(); - HtmlParser parser = new HtmlParser(); - IHtmlDocument document = parser.ParseDocument(response); - return document; + using (var stream = response.Content.ReadAsStreamAsync().Result) + { + cancellationToken.Token.ThrowIfCancellationRequested(); + HtmlParser parser = new HtmlParser(); + IHtmlDocument document = parser.ParseDocument(stream); + return document; + } } } @@ -115,7 +142,7 @@ internal async void ScrapeWebsite(LotteryDetails lotteryDetails, Dictionary()); RaiseEvent(LottoWebScrapingStages.SESSION_CREATION); Dictionary sessionParam = GetSessionBasedParameters(lotteryDetails, documentForSession); RaiseEvent(LottoWebScrapingStages.SEARCHING_DATA); diff --git a/LottoDataManager.csproj b/LottoDataManager.csproj index 0e56690..b1e5d0d 100644 --- a/LottoDataManager.csproj +++ b/LottoDataManager.csproj @@ -1,9 +1,5 @@  - - - - Debug @@ -45,36 +41,9 @@ - - packages\TimePeriodLibrary.NET.2.1.1\lib\net46\Itenso.TimePeriod.dll - - - packages\Microsoft.Bcl.AsyncInterfaces.5.0.0\lib\net461\Microsoft.Bcl.AsyncInterfaces.dll - - - packages\Microsoft.Extensions.DependencyModel.3.1.6\lib\net451\Microsoft.Extensions.DependencyModel.dll - - - packages\Microsoft.ML.1.5.5\lib\netstandard2.0\Microsoft.ML.dll - - - packages\Microsoft.ML.LightGbm.1.5.5\lib\netstandard2.0\Microsoft.ML.LightGbm.dll - - - packages\System.Buffers.4.5.1\lib\net461\System.Buffers.dll - - - packages\System.Collections.NonGeneric.4.3.0\lib\net46\System.Collections.NonGeneric.dll - True - True - - - packages\System.Numerics.Vectors.4.4.0\lib\net46\System.Numerics.Vectors.dll - True - packages\System.Runtime.InteropServices.RuntimeInformation.4.3.0\lib\net45\System.Runtime.InteropServices.RuntimeInformation.dll @@ -474,7 +443,6 @@ - SettingsSingleFileGenerator Settings.Designer.cs @@ -612,18 +580,31 @@ AngelsRepositoryLib + + + 1.1.0 + + + 8.0.101 + + + 1.11.57 + + + 3.0.1 + + + 2.9.1 + + + 4.3.0 + + + 8.0.0 + + + 2.1.5 + + - - - This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}. - - - - - - - - - - \ No newline at end of file diff --git a/packages.config b/packages.config deleted file mode 100644 index 113e8ed..0000000 --- a/packages.config +++ /dev/null @@ -1,33 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file