radiostasis/scripts/generate-metadata.csx

173 lines
6.6 KiB
C#
Executable File

#!/usr/bin/env dotnet-script
#nullable enable
#r "nuget: Microsoft.Data.Sqlite, 7.0.4"
#r "nuget: OpenAI, 1.7.2"
#r "nuget: HtmlAgilityPack, 1.11.46"
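/* This script loops through each series that has no actors recorded yet,
* fetches the websites specified by its series_links urls, then sends their
* text to the OpenAI ChatGPT API and asks for structured metadata about
* each one:
* - a list of keywords for searching (NOTE: not requested by the current prompt)
* - the broadcast year(s)
* - a list of actors
* The results are saved to the database and can be exported as a SQL
* migration file. */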
using System.Net;
using System.Text.Json;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Microsoft.Data.Sqlite;
using OpenAI_API;
using OpenAI_API.Chat;
using OpenAI_API.Models;
using SQLitePCL;
/// <summary>
/// Shape of the JSON object the chat model is asked to return for one series.
/// Every property is optional; <c>null</c> means the detail was not available.
/// </summary>
public class MetadataResponse {
    /// <summary>Comma-separated actor names as a single string.</summary>
    public string? Actors { get; set; }

    /// <summary>First year the show was broadcast.</summary>
    public int? MinYear { get; set; }

    /// <summary>Final year the show was broadcast.</summary>
    public int? MaxYear { get; set; }
}
// Parent of the current working directory; the db and scripts folders are
// resolved relative to it (presumably the script is launched from scripts/).
private readonly string BASE_PATH = Path.GetFullPath("..");

// Connection string for the SQLite database at <BASE_PATH>/db/radiostasis.db.
private readonly string CONNECTION_STRING =
    "Data Source=" + Path.Combine(BASE_PATH, "db", "radiostasis.db");

// Raw text of <BASE_PATH>/scripts/config.json.
private string configJson =
    File.ReadAllText(Path.Combine(BASE_PATH, "scripts", "config.json"));

// Settings parsed from config.json as a flat string-to-string map; a JSON
// `null` document yields an empty map.
private Dictionary<string, string> config =
    JsonSerializer.Deserialize<Dictionary<string, string>>(configJson)
    ?? new Dictionary<string, string>();

// Matches any run of one or more whitespace characters.
private readonly Regex whiteSpace = new Regex(@"\s+", RegexOptions.Compiled);
/// <summary>
/// Loads the links of every series whose <c>actors</c> column is still null.
/// </summary>
/// <returns>
/// A map from series slug to a tuple holding the series title (Item1) and
/// the list of link urls recorded for that series (Item2).
/// </returns>
private Dictionary<string, (string, List<string>)> GetSeriesLinks() {
    var bySlug = new Dictionary<string, (string, List<string>)>();

    using var connection = new SqliteConnection(CONNECTION_STRING);
    connection.Open();

    using var cmd = connection.CreateCommand();
    cmd.CommandText =
@"select l.series_slug, l.link_url, s.title from series_links l
inner join series s on s.series_slug=l.series_slug
where s.actors is null";

    using var reader = cmd.ExecuteReader();
    while (reader.Read()) {
        string slug = reader.GetString(0);
        string url = reader.GetString(1);
        string title = reader.GetString(2);

        // One row per link: create the entry on the first row of a series,
        // then keep appending urls to the same list.
        if (!bySlug.TryGetValue(slug, out var entry)) {
            entry = (title, new List<string>());
            bySlug.Add(slug, entry);
        }
        entry.Item2.Add(url);
    }

    return bySlug;
}
/// <summary>
/// Parses a JSON metadata response and writes it to the matching series row.
/// </summary>
/// <remarks>
/// <c>actors</c> is always overwritten (with NULL when absent), whereas
/// <c>min_year</c> and <c>max_year</c> keep their current values when the
/// response does not supply them.
/// </remarks>
/// <param name="slug">Value of <c>series_slug</c> identifying the row to update.</param>
/// <param name="json">JSON text deserialized into a <see cref="MetadataResponse"/>.</param>
private void SaveMetadata(string slug, string json) {
    if (JsonSerializer.Deserialize<MetadataResponse>(json) is not MetadataResponse parsed) {
        // The JSON literal `null` deserializes to null: nothing to store.
        return;
    }

    using var connection = new SqliteConnection(CONNECTION_STRING);
    connection.Open();

    using var cmd = connection.CreateCommand();
    cmd.CommandText =
@"update series set
actors=@Actors,
min_year = coalesce(@MinYear, min_year),
max_year = coalesce(@MaxYear, max_year)
where series_slug=@Slug";

    // Creates one named parameter and attaches it to the command.
    void Bind(string name, object value) {
        var parameter = cmd.CreateParameter();
        parameter.ParameterName = name;
        parameter.Value = value;
        cmd.Parameters.Add(parameter);
    }

    // Missing values are sent as DBNull so SQLite sees a real NULL.
    Bind("@Slug", slug);
    Bind("@Actors", (object?)parsed.Actors ?? DBNull.Value);
    Bind("@MinYear", (object?)parsed.MinYear ?? DBNull.Value);
    Bind("@MaxYear", (object?)parsed.MaxYear ?? DBNull.Value);

    cmd.ExecuteNonQuery();
}
/// <summary>
/// For every series returned by <c>GetSeriesLinks</c>, downloads the linked
/// pages, asks the chat model to extract actors and broadcast years from
/// their text, prints the raw reply and stores it via <c>SaveMetadata</c>.
/// </summary>
/// <remarks>
/// The API key is read from the <c>openAiApiKey</c> entry of the config map.
/// Each page contributes at most 4000 characters of whitespace-collapsed text.
/// </remarks>
private async Task GenerateMetadata() {
    // Upper bound on the characters taken from a single page.
    const int maxCharsPerPage = 4000;

    var links = GetSeriesLinks();
    var web = new HtmlWeb();
    var api = new OpenAIAPI(config["openAiApiKey"]);

    foreach (var (slug, entry) in links) {
        var (title, urls) = entry;

        var sb = new StringBuilder();
        sb.AppendLine(
@$"What follows are details about a radio show named ""{title}"".
Please generate a JSON response that contains three properties.
The first property should be named `Actors` and its value should be a string (NOT an array of strings) containing distinct comma-separated names of actors who performed in the show. It should contain only the names of the actors, excluding information about what role they played. If the details do not provide information about actors then this property should be `null`.
The second property should be named `MinYear` and its value should be the first year that the show was broadcast as an integer, if that information is available in the details. If that information is not available in the details then this should be `null`.
The third property should be named `MaxYear` and its value should be the final year that the show was broadcast as an integer, if that information is available in the details. If that information is not available in the details then this should be `null`.
Your entire response will be parsed directly as JSON so make sure it is formatted and escaped correctly.
Here are the details:");

        foreach (var url in urls) {
            var doc = await web.LoadFromWebAsync(url);
            var root = doc.DocumentNode;

            // Known sites keep their main text in a specific container.
            string? siteXPath =
                url.Contains("en.wikipedia.org") ? "//div[@id='mw-content-text']"
                : url.Contains("otrcat.com") ? "//div[@id='product_info']"
                : url.Contains("archive.org") ? "//div[@id='descript']"
                : null;

            // `??` binds tighter than `?:`, so the generic fallbacks must be
            // applied to the result of the site-specific lookup as a whole.
            // Otherwise a known site whose container is missing yields null
            // and the InnerText access below throws.
            var contentNode =
                (siteXPath is null ? null : root.SelectSingleNode(siteXPath))
                ?? root.SelectSingleNode("//article")
                ?? root.SelectSingleNode("//main")
                ?? root;

            var content = whiteSpace.Replace(
                WebUtility.HtmlDecode(contentNode.InnerText), " ");
            sb.AppendLine();
            sb.AppendLine(content.Length > maxCharsPerPage
                ? content[..maxCharsPerPage]
                : content);
        }

        var chat = api.Chat.CreateConversation(new ChatRequest {
            Model = Model.ChatGPTTurbo,
        });
        chat.AppendUserInput(sb.ToString());

        // Turn quoted "null" strings into JSON null literals before parsing.
        var response = (await chat.GetResponseFromChatbotAsync())
            .Replace(@"""null""", "null");
        WriteLine(response);
        SaveMetadata(slug, response);
    }
}
/// <summary>
/// Dumps the actors and year range of every series as SQL <c>update</c>
/// statements into <c>db/migrations/009-metadata.sql</c> under the base path.
/// </summary>
/// <remarks>
/// Statements are separated by a blank line. Missing values are written as
/// the SQL literal <c>null</c>, and text values are single-quoted with
/// embedded quotes doubled.
/// </remarks>
private void ExportMetadata() {
    // Renders a nullable string as a SQL literal.
    static string SqlText(string? value) =>
        value is null ? "null" : "'" + value.Replace("'", "''") + "'";

    // Renders a nullable integer as a SQL literal. Interpolating a null
    // int? directly would yield an empty string and break the statement.
    static string SqlInt(int? value) =>
        value?.ToString(System.Globalization.CultureInfo.InvariantCulture) ?? "null";

    using var connection = new SqliteConnection(CONNECTION_STRING);
    connection.Open();

    using var cmd = connection.CreateCommand();
    cmd.CommandText =
@"select series_slug, actors, min_year, max_year
from series";

    using var reader = cmd.ExecuteReader();
    var sb = new StringBuilder();
    var first = true;
    while (reader.Read()) {
        // Blank line between consecutive statements, none before the first.
        if (first) {
            first = false;
        } else {
            sb.AppendLine();
        }

        var slug = SqlText(reader.GetString(0));
        var actors = SqlText(reader.IsDBNull(1) ? null : reader.GetString(1));
        var min = SqlInt(reader.IsDBNull(2) ? null : reader.GetInt32(2));
        var max = SqlInt(reader.IsDBNull(3) ? null : reader.GetInt32(3));

        sb.AppendLine(@$"update series set
actors={actors},
min_year={min},
max_year={max}
where series_slug={slug};");
    }

    File.WriteAllText(
        Path.Combine(BASE_PATH, "db", "migrations", "009-metadata.sql"),
        sb.ToString());
}
// Script entry point. Runs before any SqliteConnection is opened.
// NOTE(review): presumably registers the SQLitePCL native provider that
// Microsoft.Data.Sqlite needs under dotnet-script — confirm.
Batteries.Init();
// Generation is currently disabled: uncomment the next line to query the
// OpenAI API and store fresh metadata before the export below runs.
//await GenerateMetadata();
// Write the metadata currently in the database to the migration file.
ExportMetadata();