radiostasis/scripts/generate-descriptions.csx

143 lines
5.7 KiB
Text
Raw Normal View History

#!/usr/bin/env dotnet-script
#nullable enable
#r "nuget: Microsoft.Data.Sqlite, 7.0.4"
#r "nuget: OpenAI, 1.7.2"
#r "nuget: HtmlAgilityPack, 1.11.46"
/* This script loops through each series that has no description yet, fetches
* the web pages specified by its series_links URLs, then sends their text to
* the OpenAI chat API and asks for a two-paragraph synopsis, with the first
* paragraph focusing on a description of the show and the second focusing on
* air dates, networks, actors, and other minutiae. */
using System.Net;
using System.Text.Json;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Microsoft.Data.Sqlite;
using OpenAI_API;
using OpenAI_API.Chat;
using OpenAI_API.Models;
using SQLitePCL;
// Parent of the current working directory, resolved to an absolute path.
// NOTE(review): presumably the repository root, which means the script is
// expected to be run from inside the scripts directory — confirm.
private readonly string BASE_PATH = Path.GetFullPath("..");
// SQLite connection string pointing at <BASE_PATH>/db/radiostasis.db.
private readonly string CONNECTION_STRING =
$"Data Source={Path.Combine(BASE_PATH, "db", "radiostasis.db")}";
// Raw text of <BASE_PATH>/scripts/config.json; reading it throws if the file
// does not exist.
private string configJson = File.ReadAllText(
Path.Combine(BASE_PATH, "scripts", "config.json"));
// Settings parsed from config.json as a flat string-to-string map. Falls back
// to an empty dictionary when the JSON document deserializes to null.
private Dictionary<string, string> config =
JsonSerializer.Deserialize<Dictionary<string, string>>(configJson)
?? new();
// Matches any run of whitespace; used to collapse scraped page text.
private readonly Regex whiteSpace = new(@"\s+", RegexOptions.Compiled);
/// <summary>
/// Reads the link URLs of every series whose description is still null.
/// </summary>
/// <returns>
/// A dictionary keyed by series slug. Each value pairs the series title with
/// the list of link URLs recorded for that series.
/// </returns>
private Dictionary<string, (string, List<string>)> GetSeriesLinks() {
  var result = new Dictionary<string, (string, List<string>)>();
  using var connection = new SqliteConnection(CONNECTION_STRING);
  connection.Open();
  using var cmd = connection.CreateCommand();
  cmd.CommandText =
@"select l.series_slug, l.link_url, s.title from series_links l
inner join series s on s.series_slug=l.series_slug
where s.description is null";
  using var rows = cmd.ExecuteReader();
  while (rows.Read()) {
    var seriesSlug = rows.GetString(0);
    var linkUrl = rows.GetString(1);
    var seriesTitle = rows.GetString(2);
    // The query yields one row per (series, link) pair, so the entry for a
    // slug is created the first time that slug is seen.
    if (!result.TryGetValue(seriesSlug, out var entry)) {
      entry = (seriesTitle, new List<string>());
      result.Add(seriesSlug, entry);
    }
    // The tuple is a copy, but its list is the same instance stored in the
    // dictionary, so adding here updates the stored entry.
    entry.Item2.Add(linkUrl);
  }
  return result;
}
/// <summary>
/// Stores a description on the series row identified by a slug.
/// </summary>
/// <param name="slug">Value matched against <c>series.series_slug</c>.</param>
/// <param name="description">Text written to <c>series.description</c>.</param>
private void SaveDescription(string slug, string description) {
  using var connection = new SqliteConnection(CONNECTION_STRING);
  connection.Open();
  using var update = connection.CreateCommand();
  update.CommandText =
@"update series set description=@Description
where series_slug=@Slug";
  // Both values are bound as parameters, never concatenated into the SQL.
  update.Parameters.AddWithValue("@Description", description);
  update.Parameters.AddWithValue("@Slug", slug);
  update.ExecuteNonQuery();
}
/// <summary>
/// For every series returned by <c>GetSeriesLinks</c>, downloads each linked
/// page, extracts its main text, asks the OpenAI chat API for a two paragraph
/// summary, saves the answer with <c>SaveDescription</c>, and echoes it to the
/// console.
/// </summary>
/// <remarks>
/// Each description is saved as soon as it is generated, and
/// <c>GetSeriesLinks</c> only returns series without a description, so an
/// interrupted run can be restarted without redoing finished series.
/// </remarks>
private async Task GenerateDescriptions() {
  var links = GetSeriesLinks();
  var web = new HtmlWeb();
  var api = new OpenAIAPI(config["openAiApiKey"]);
  foreach (var (slug, entry) in links) {
    var (title, urls) = entry;
    var sb = new StringBuilder();
    sb.AppendLine(
@$"What follows are details about a radio show named ""{title}"".
Please generate a two paragraph summary of the information.
The first paragraph should begin with the show's name without quotes, and focus on the show's premise, format, tone, and style. The first paragraph should not contain air dates, years, broadcast networks, or names of people involved in the show's production. The first paragraph should not describe the show as an ""old time radio show"" or mention the ""golden age of radio.""
The second paragraph can cover what years the show was broadcast, what radio networks the show aired on, how many episodes aired and are known to still exist today, interesting trivia including whether there were related TV shows or other media formats, notable advertisers and sponsors, and any notable actors and other people involved in the production if any of that information is contained in the provided details.
The summary should not include any information about the Old Time Radio Researcher Group (OTRR), or any discussion about ""collections.""
Here are the details:");
    foreach (var url in urls) {
      var doc = await web.LoadFromWebAsync(url);
      var contentNode = SelectContentNode(doc, url);
      // Decode HTML entities, then collapse every whitespace run to one space.
      var content = whiteSpace.Replace(
        WebUtility.HtmlDecode(contentNode.InnerText), " ");
      sb.AppendLine();
      // Cap each page at 8000 characters to bound the size of the prompt.
      sb.AppendLine(content.Length > 8000 ? content[..8000] : content);
    }
    var chat = api.Chat.CreateConversation(new ChatRequest {
      Model = Model.GPT4,
    });
    chat.AppendUserInput(sb.ToString());
    var response = await chat.GetResponseFromChatbotAsync();
    SaveDescription(slug, response);
    WriteLine(response);
    WriteLine("----------");
    WriteLine();
  }
}

/// <summary>
/// Picks the node that holds the main text of a downloaded page.
/// </summary>
/// <param name="doc">The parsed page.</param>
/// <param name="url">The page URL, used to choose a site-specific selector.</param>
/// <returns>
/// The first match among: the site-specific container (Wikipedia, otrcat.com,
/// archive.org), <c>//article</c>, <c>//main</c>; otherwise the document root.
/// Never null.
/// </returns>
private HtmlNode SelectContentNode(HtmlDocument doc, string url) {
  var root = doc.DocumentNode;
  string? siteXPath =
    url.Contains("en.wikipedia.org") ? "//div[@id='mw-content-text']"
    : url.Contains("otrcat.com") ? "//div[@id='product_info']"
    : url.Contains("archive.org") ? "//div[@id='descript']"
    : null;
  HtmlNode? siteNode = siteXPath is null ? null : root.SelectSingleNode(siteXPath);
  // The fallbacks are chained after the site lookup, not inside the last
  // branch of a conditional: `??` binds tighter than `?:`, so a chain written
  // inline would only protect the final branch and a known site whose
  // container is missing would yield null.
  return siteNode
    ?? root.SelectSingleNode("//article")
    ?? root.SelectSingleNode("//main")
    ?? root;
}
/// <summary>
/// Writes one SQL <c>update</c> statement per described series to
/// <c>db/migrations/007-descriptions.sql</c> under <c>BASE_PATH</c>,
/// overwriting any existing file. Statements are separated by a blank line.
/// </summary>
private void ExportDescriptions() {
  using var connection = new SqliteConnection(CONNECTION_STRING);
  connection.Open();
  using var cmd = connection.CreateCommand();
  // SQLite builds each statement itself. Single quotes are doubled in both
  // the description and the slug so the generated literals stay valid SQL
  // even when a value contains an apostrophe.
  cmd.CommandText =
@"select 'update series set description='''
|| replace(description, '''', '''''') || '''
where series_slug=''' || replace(series_slug, '''', '''''') || ''';'
from series where description is not null";
  using var stream = File.Open(Path.Combine(
    BASE_PATH, "db", "migrations", "007-descriptions.sql"), FileMode.Create);
  using var sw = new StreamWriter(stream);
  using var reader = cmd.ExecuteReader();
  var first = true;
  while (reader.Read()) {
    // Emit a blank line before every statement except the first.
    if (first) {
      first = false;
    } else {
      sw.WriteLine();
    }
    sw.WriteLine(reader.GetString(0));
  }
}
// Script entry point.
// NOTE(review): Batteries presumably comes from the SQLitePCL namespace
// imported by this script and sets up the native SQLite provider before the
// first connection is opened — confirm.
Batteries.Init();
// Generate and store descriptions for every series that lacks one.
await GenerateDescriptions();
// Then dump all stored descriptions to the SQL migration file.
ExportDescriptions();