Skip to content

Commit

Permalink
128: Added website crawling feature
Browse files Browse the repository at this point in the history
  • Loading branch information
jarmatys committed Dec 27, 2024
1 parent f65ca90 commit 0800c3c
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@ namespace ASSISTENTE.Infrastructure.Firecrawl.Contracts;
/// <summary>
/// Contract for the Firecrawl integration: scraping a single page and running
/// crawl jobs whose pages are returned as <see cref="PageDetails"/>.
/// </summary>
public interface IFirecrawlService
{
/// <summary>Scrapes a single website.</summary>
/// <param name="website">Address of the page to scrape.</param>
/// <returns>
/// A result that, on success, carries the markdown content of the page;
/// a failure carries an error message.
/// </returns>
Task<Result<string>> ScrapeAsync(string website);
/// <summary>Starts a crawl job for the given website.</summary>
/// <param name="website">Address of the website to crawl.</param>
/// <returns>
/// A result with a string payload; a failure carries an error message.
/// NOTE(review): the payload is presumably meant to be the crawl job id that
/// <see cref="CrawlResultAsync"/> expects - confirm against the implementation.
/// </returns>
Task<Result<string>> InitCrawlAsync(string website);
/// <summary>Retrieves the pages collected by a crawl job.</summary>
/// <param name="jobId">Identifier of the crawl job.</param>
/// <returns>
/// A result that, on success, carries one <see cref="PageDetails"/> per crawled page;
/// a failure carries an error message.
/// </returns>
Task<Result<List<PageDetails>>> CrawlResultAsync(string jobId);
}
36 changes: 36 additions & 0 deletions API/ASSISTENTE.Infrastructure.Firecrawl/Contracts/PageDetails.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using CSharpFunctionalExtensions;

namespace ASSISTENTE.Infrastructure.Firecrawl.Contracts;

/// <summary>
/// Read-only description of a single page: its address, title, content and links.
/// Instances can only be obtained through <see cref="Create"/>, which rejects
/// missing or empty values.
/// </summary>
public class PageDetails
{
    private PageDetails(string url, string title, string content, string links)
        => (Url, Title, Content, Links) = (url, title, content, links);

    /// <summary>Address of the page.</summary>
    public string Url { get; }

    /// <summary>Title of the page.</summary>
    public string Title { get; }

    /// <summary>Content of the page.</summary>
    public string Content { get; }

    /// <summary>Links associated with the page, kept as a single string.</summary>
    public string Links { get; }

    /// <summary>
    /// Validates the supplied values and builds a <see cref="PageDetails"/>.
    /// </summary>
    /// <param name="url">Address of the page; must not be null or empty.</param>
    /// <param name="title">Title of the page; must not be null or empty.</param>
    /// <param name="content">Content of the page; must not be null or empty.</param>
    /// <param name="links">Links of the page; must not be null or empty.</param>
    /// <returns>
    /// A successful result wrapping the new instance, or a failure naming the first
    /// missing value (checked in the order url, title, content, links).
    /// </returns>
    public static Result<PageDetails> Create(string? url, string? title, string? content, string? links)
    {
        // "is not { Length: > 0 }" is true for both null and the empty string.
        if (url is not { Length: > 0 })
            return Missing("Url is required.");

        if (title is not { Length: > 0 })
            return Missing("Title is required.");

        if (content is not { Length: > 0 })
            return Missing("Content is required.");

        if (links is not { Length: > 0 })
            return Missing("Links are required.");

        return Result.Success(new PageDetails(url, title, content, links));

        static Result<PageDetails> Missing(string message) => Result.Failure<PageDetails>(message);
    }
}
44 changes: 43 additions & 1 deletion API/ASSISTENTE.Infrastructure.Firecrawl/FirecrawlService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,52 @@ public async Task<Result<string>> ScrapeAsync(string website)
return Result.Failure<string>(response.Warning);

var markdown = response.Data?.Markdown;

if (string.IsNullOrEmpty(markdown))
return Result.Failure<string>("No markdown content found.");

return Result.Success(markdown);
}

/// <summary>
/// Starts a crawl of <paramref name="website"/>, waits for the crawl job to finish
/// and returns the id of that job.
/// </summary>
/// <param name="website">Address of the website to crawl; must not be null or blank.</param>
/// <returns>
/// A successful result carrying the crawl job id (the value <c>CrawlResultAsync</c> expects),
/// or a failure when the address is missing, the job could not be started, or the
/// finished job carries no data.
/// </returns>
public async Task<Result<string>> InitCrawlAsync(string website)
{
    if (string.IsNullOrWhiteSpace(website))
        return Result.Failure<string>("Website url is required.");

    var crawlJob = await client.Crawling.CrawlUrlsAsync(website);

    if (string.IsNullOrEmpty(crawlJob.JobId))
        return Result.Failure<string>("Crawl job failed to start.");

    // Block until the job is done so that a follow-up status query sees a finished job.
    var crawlResult = await client.Crawl.WaitJobAsync(crawlJob.JobId);

    if (crawlResult.Data == null)
        return Result.Failure<string>("Crawl job failed to complete.");

    // Hand the job id back to the caller; an empty string would make the job unreachable.
    return Result.Success(crawlJob.JobId);
}

/// <summary>
/// Fetches the status of the crawl job identified by <paramref name="jobId"/> and maps
/// every crawled page to a <see cref="PageDetails"/>.
/// </summary>
/// <param name="jobId">Identifier of the crawl job to query; must not be null or blank.</param>
/// <returns>
/// A successful result with one <see cref="PageDetails"/> per page, or a failure when the
/// id is missing, the job has no data or is not in the "completed" status, or any page
/// lacks a value required by <see cref="PageDetails.Create"/>.
/// </returns>
public async Task<Result<List<PageDetails>>> CrawlResultAsync(string jobId)
{
    if (string.IsNullOrWhiteSpace(jobId))
        return Result.Failure<List<PageDetails>>("Crawl job id is required.");

    // Query the job the caller asked for rather than a fixed, hard-coded one.
    var crawlJob = await client.Crawl.GetCrawlStatusAsync(jobId);

    if (crawlJob.Data == null || crawlJob.Status != "completed")
        return Result.Failure<List<PageDetails>>("Crawl job failed to complete.");

    var pagesDetails = crawlJob.Data
        .Select(x => PageDetails.Create(
            url: x.Metadata?.SourceURL,
            title: x.Metadata?.Title,
            content: x.Markdown,
            // A page without "linksOnPage" yields null here, which PageDetails.Create
            // turns into a failure instead of the indexer throwing KeyNotFoundException.
            links: x.AdditionalProperties.TryGetValue("linksOnPage", out var rawLinks)
                ? rawLinks?.ToString()
                : null))
        .ToList();

    if (pagesDetails.Any(x => x.IsFailure))
        return Result.Failure<List<PageDetails>>("Failed to create page details.");

    var result = pagesDetails.Select(x => x.Value).ToList();

    return Result.Success(result);
}
}
2 changes: 1 addition & 1 deletion API/ASSISTENTE.Playground/Playground.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public async Task LearnAsync()

/// <summary>
/// Runs the currently selected playground task (<c>Task_03</c>) and logs its outcome:
/// the value on success, the error on failure.
/// </summary>
public async Task RunAsync()
{
    // The awaited result is only needed for logging, so it is not stored in a local.
    await week4.Task_03()
        .Tap(answer => logger.LogInformation(answer))
        .TapError(error => logger.LogError(error));
}
Expand Down
101 changes: 99 additions & 2 deletions API/ASSISTENTE.Playground/Tasks/Week4.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
using System.Net.Http.Json;
using System.Text;
using System.Text.Json;
using ASSISTENTE.Infrastructure.Audio.Contracts;
Expand Down Expand Up @@ -27,6 +28,8 @@ public class Week4(
INeo4JService neo4JService,
IQdrantService qdrantService) : TaskBase(httpClient)
{
private readonly HttpClient _httpClient = httpClient;

public async Task<Result<string>> Task_01()
{
// TODO: Prompt wybierający metody do rozwiązania zadania
Expand Down Expand Up @@ -136,9 +139,9 @@ string ProcessFineTuningModels(List<string> incorrectList, List<string> correctL

return text.ToString();
}

static List<FineTuningModel> GenerateFineTuningModels(
IEnumerable<string> records,
IEnumerable<string> records,
string systemPrompt,
string assistantResponse)
{
Expand Down Expand Up @@ -174,4 +177,98 @@ static List<FineTuningModel> GenerateFineTuningModels(
return fineTuningModels;
}
}

/// <summary>
/// Answers the "softo" questions using the pages of an already finished crawl.
/// For every question the LLM first picks the page addresses worth reading, then
/// answers the question from the content of exactly those pages; all answers are
/// finally passed to <c>ReportResult</c>.
/// </summary>
/// <returns>
/// The result of <c>ReportResult</c>, or a failure when the questions cannot be read,
/// the crawl result is unavailable, or the LLM returns no text for a prompt.
/// </returns>
public async Task<Result<string>> Task_03()
{
    const string url = $"https://centrala.ag3nts.org/data/{ApiKey}/softo.json";

    // Id of a crawl job for https://softo.ag3nts.org that was started beforehand.
    // TODO: start the crawl via firecrawlService.InitCrawlAsync and use the id it returns.
    const string crawlJobId = "cf7e3926-722f-48c8-99b0-e30763332f88";

    var softoData = await _httpClient.GetStringAsync(url);

    var questions = JsonSerializer.Deserialize<Dictionary<string, string>>(softoData);

    if (questions is null)
        return Result.Failure<string>("Failed to read questions.");

    var crawlResult = await firecrawlService.CrawlResultAsync(crawlJobId);

    // Surface the crawl error instead of dereferencing a missing page list.
    if (crawlResult.IsFailure)
        return Result.Failure<string>(crawlResult.Error);

    var pagesDetails = crawlResult.Value;

    var sources = new StringBuilder();

    foreach (var page in pagesDetails)
    {
        sources.AppendLine($"<ŹRÓDŁO>{page.Url}</ŹRÓDŁO>");
    }

    var result = new Dictionary<string, string>();

    foreach (var question in questions)
    {
        // NOTE(review): the <ZASADY> and <PRZYKŁAD> sections below are "closed" with opening
        // tags; the prompt text is left untouched because changing it may change LLM output.
        var masterPrompt = $"""
            Twoim jest wybranie źródeł informacji, które pomogą odpowiedzieć na pytanie:
            <PYTANIE>
            {question.Value}
            </PYTANIE>
            Żródła informacji:
            {sources}
            <ZASADY>
            1. Zwróć tylko i wyłącznie adresy URL, według przykładu
            2. Możesz zwrócić więcej niż 1 adres, jeżeli uważasz, że dana strona może zawierać niezbędne informacje
            <ZASADY>
            <PRZYKŁAD>
            https://softo.ag3nts.org
            <PRZYKŁAD>
            <PRZYKŁAD>
            https://softo.ag3nts.org/porfolio, https://softo.ag3nts.org/about, https://softo.ag3nts.org/contact
            <PRZYKŁAD>
            """;

        var sourcesToVerify = await Prompt.Create(masterPrompt)
            .Bind(async prompt => await llmClient.GenerateAnswer(prompt))
            .GetValueOrDefault(x => x.Text);

        if (string.IsNullOrWhiteSpace(sourcesToVerify))
            return Result.Failure<string>($"Failed to select sources for question '{question.Key}'.");

        // Materialize once into a set: membership is checked for every crawled page.
        // Line breaks are accepted as separators too, in case the model ignores the comma format.
        var urlsToCheck = sourcesToVerify
            .Split(new[] { ',', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            .ToHashSet(StringComparer.Ordinal);

        var context = new StringBuilder();

        foreach (var page in pagesDetails.Where(page => urlsToCheck.Contains(page.Url)))
        {
            context.AppendLine($"<ŹRÓDŁO>{page.Url}</ŹRÓDŁO>");
            context.AppendLine($"<TYTUŁ>{page.Title}</TYTUŁ>");
            context.AppendLine($"<ZAWARTOŚĆ>{page.Content}</ZAWARTOŚĆ>");
            context.AppendLine($"<LINKI>{page.Links}</LINKI>");
        }

        var answerPrompt = $"""
            Odpowiedz na pytanie na podstawie zebranych informacji i zwróć odpowiedź w formie krótkiej i zwięzłej.
            <PYTANIE>
            {question.Value}
            </PYTANIE>
            <ZASADY>
            1. Zwróć tylko i wyłącznie odpowiedź na pytanie, bez dodatkowych informacji i zdań
            2. Bądź bardzo konkretny i zwięzły
            </ZASADY>
            <KONTEKST>
            {context}
            </KONTEKST>
            """;

        var answer = await Prompt.Create(answerPrompt)
            .Bind(async prompt => await llmClient.GenerateAnswer(prompt))
            .GetValueOrDefault(x => x.Text);

        if (answer is null)
            return Result.Failure<string>($"Failed to generate answer for question '{question.Key}'.");

        result.Add(question.Key, answer);
    }

    return await ReportResult("softo", taskResult: result);
}
}

0 comments on commit 0800c3c

Please sign in to comment.