Skip to content

Commit

Permalink
Fix issues #404/#374: Refresh the cookies on auth error and try to continue
Browse files Browse the repository at this point in the history

- Occasionally something happens to the cookies and Tumblr returns an auth error on the next request.
- In case of an auth error the cookies are fetched from the WebView instance again and the request is repeated.
- It's not totally clear yet why this happens.
  • Loading branch information
thomas694 committed Aug 26, 2023
1 parent f896e2c commit 5fe3c7b
Show file tree
Hide file tree
Showing 16 changed files with 183 additions and 41 deletions.
4 changes: 2 additions & 2 deletions src/TumblThree/SharedAssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@

[assembly: ComVisible(false)]
[assembly: NeutralResourcesLanguage("en-US", UltimateResourceFallbackLocation.MainAssembly)]
[assembly: AssemblyVersion("2.11.1.0")]
[assembly: AssemblyFileVersion("2.11.1.0")]
[assembly: AssemblyVersion("2.11.2.0")]
[assembly: AssemblyFileVersion("2.11.2.0")]
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
using Microsoft.Win32;
using System;
using System;
using System.Collections.Generic;
using System.ComponentModel.Composition;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Reflection;
using System.Threading.Tasks;
using System.Waf.Applications;
using System.Waf.Applications.Services;
Expand All @@ -20,8 +19,6 @@
using TumblThree.Applications.Views;
using TumblThree.Domain;
using TumblThree.Domain.Queue;
using TumblThree.Applications.Extensions;
using System.Reflection;

namespace TumblThree.Applications.Controllers
{
Expand Down Expand Up @@ -150,7 +147,7 @@ public void Initialize()
QueueController.Initialize();
DetailsController.Initialize();
CrawlerController.Initialize();
_cookieService.SetUriCookie(_cookieList);
_cookieService.SetUriCookie(CleanOldTumblrCookies(_cookieList));
}

public async void Run()
Expand Down Expand Up @@ -475,5 +472,14 @@ private static void InitializeCultures(AppSettings settings)
CultureInfo.CurrentUICulture = ci;
}
}

/// <summary>
/// Removes every cookie whose domain is exactly "www.tumblr.com" when the list contains a
/// "sid" cookie for that domain; otherwise returns the list unchanged.
/// </summary>
/// <param name="cookies">The cookies to filter. Must not be null.</param>
/// <returns>
/// A new list without the "www.tumblr.com" cookies when a matching "sid" cookie was found;
/// otherwise the same list instance that was passed in.
/// </returns>
private static List<Cookie> CleanOldTumblrCookies(List<Cookie> cookies)
{
    const string tumblrHost = "www.tumblr.com";

    bool containsHostSid = cookies.Exists(c => c.Name == "sid" && c.Domain == tumblrHost);
    if (!containsHostSid)
    {
        return cookies;
    }

    // Keep only the cookies that belong to any other domain.
    return cookies.FindAll(c => c.Domain != tumblrHost);
}
}
}
13 changes: 13 additions & 0 deletions src/TumblThree/TumblThree.Applications/Crawler/AbstractCrawler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -620,5 +620,18 @@ protected bool HandleUnauthorizedWebException(WebException webException)
ShellService.ShowError(webException, Resources.PasswordProtected, Blog.Name);
return true;
}

/// <summary>
/// Checks whether a failed request was answered with HTTP 401 (Unauthorized) and, if so,
/// logs and displays the "auth error, retrying" message for the current blog.
/// </summary>
/// <param name="webException">The exception raised by the failed request; may be null.</param>
/// <returns>
/// <c>true</c> when the exception carries an HTTP response with status
/// <see cref="HttpStatusCode.Unauthorized"/>; otherwise <c>false</c>.
/// </returns>
protected bool HandleUnauthorizedWebExceptionRetry(WebException webException)
{
    // Use 'as' rather than a hard cast: WebException.Response is typed as WebResponse, and a
    // non-HTTP response must not raise InvalidCastException here and hide the original error.
    var resp = webException?.Response as HttpWebResponse;
    if (resp == null || resp.StatusCode != HttpStatusCode.Unauthorized)
    {
        return false;
    }

    Logger.Error("{0}, {1}", string.Format(CultureInfo.CurrentCulture, Resources.AuthErrorRetrying, Blog.Name), webException.Message);
    ShellService.ShowError(webException, Resources.AuthErrorRetrying, Blog.Name);
    return true;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

using Microsoft.Web.WebView2.Core;
using Microsoft.Web.WebView2.Wpf;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using TumblThree.Applications.DataModels;
using TumblThree.Applications.DataModels.TumblrApiJson;
using TumblrSvcJson = TumblThree.Applications.DataModels.TumblrSvcJson;
Expand All @@ -16,10 +21,7 @@
using TumblThree.Applications.Services;
using TumblThree.Domain;
using TumblThree.Domain.Models.Blogs;
using System.IO;
using TumblThree.Applications.Downloader;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using TumblThree.Domain.Models;

namespace TumblThree.Applications.Crawler
Expand All @@ -31,6 +33,8 @@ public abstract class AbstractTumblrCrawler : AbstractCrawler
private static readonly Regex extractImageSize = new Regex("/s(\\d+?)x(\\d+?)[^/]*?/");

protected readonly ICrawlerDataDownloader crawlerDataDownloader;
protected readonly IEnvironmentService environmentService;
protected readonly ILoginService loginService;

public ITumblrParser TumblrParser { get; }

Expand All @@ -47,7 +51,7 @@ public abstract class AbstractTumblrCrawler : AbstractCrawler
protected AbstractTumblrCrawler(IShellService shellService, ICrawlerService crawlerService, IWebRequestFactory webRequestFactory, ISharedCookieService cookieService,
ITumblrParser tumblrParser, IImgurParser imgurParser, IGfycatParser gfycatParser, IWebmshareParser webmshareParser, IUguuParser uguuParser,
ICatBoxParser catboxParser, IPostQueue<AbstractPost> postQueue, IBlog blog, IDownloader downloader, ICrawlerDataDownloader crawlerDataDownloader,
IProgress<DownloadProgress> progress, PauseToken pt, CancellationToken ct)
IProgress<DownloadProgress> progress, IEnvironmentService environmentService, ILoginService loginService, PauseToken pt, CancellationToken ct)
: base(shellService, crawlerService, progress, webRequestFactory, cookieService, postQueue, blog, downloader, pt, ct)
{
this.crawlerDataDownloader = crawlerDataDownloader;
Expand All @@ -58,6 +62,8 @@ protected AbstractTumblrCrawler(IShellService shellService, ICrawlerService craw
this.WebmshareParser = webmshareParser;
this.UguuParser = uguuParser;
this.CatboxParser = catboxParser;
this.environmentService = environmentService;
this.loginService = loginService;
}

protected async Task<string> GetRequestAsync(string url)
Expand Down Expand Up @@ -570,6 +576,37 @@ protected static void AddDownloadedMedia(string url, string filename, TumblrSvcJ
post.DownloadedUrls.Add(url);
}

/// <summary>
/// Reads the cookies for "https://www.tumblr.com/" from a freshly created WebView2 instance
/// and passes them to the login service.
/// </summary>
/// <returns>
/// <c>true</c> once the cookies have been handed to <c>loginService</c>; <c>false</c> when the
/// crawler was constructed without an environment service or login service, in which case
/// nothing is reloaded.
/// </returns>
protected async Task<bool> FetchCookiesAgainAsync()
{
    // Both services arrive through the constructor and may be null; bail out with a log entry
    // instead of failing with a NullReferenceException in the middle of a retry.
    if (environmentService == null || loginService == null)
    {
        Logger.Warning("Cannot reload Tumblr cookies: environment or login service is not available");
        return false;
    }

    // The parent directory of the app settings path is passed as the WebView2 user data folder.
    var appSettingsPath = Path.GetFullPath(Path.Combine(environmentService.AppSettingsPath, ".."));
    CoreWebView2Environment env = await CoreWebView2Environment.CreateAsync(null, appSettingsPath);
    using (WebView2 browser = new WebView2())
    {
        await browser.EnsureCoreWebView2Async(env);
        var cookieManager = browser.CoreWebView2.CookieManager;
        var cookies = await cookieManager.GetCookiesAsync("https://www.tumblr.com/");
        CookieCollection cookieCollection = GetCookies(cookies);
        loginService.AddCookies(cookieCollection);
    }
    Logger.Warning("Reloaded Tumblr cookies");
    ShellService.ShowError(null, "Warning: Reloaded Tumblr cookies");
    return true;
}

/// <summary>
/// Converts a list of WebView2 cookies into a <see cref="CookieCollection"/> of
/// <see cref="System.Net.Cookie"/> objects.
/// </summary>
/// <param name="cookies">The WebView2 cookies to convert.</param>
/// <returns>A collection holding one converted cookie per input cookie.</returns>
private static CookieCollection GetCookies(List<CoreWebView2Cookie> cookies)
{
    var converted = new CookieCollection();
    foreach (CoreWebView2Cookie source in cookies)
    {
        // The value is URL-encoded before being stored; name, path and domain are copied as-is.
        converted.Add(new System.Net.Cookie(source.Name, WebUtility.UrlEncode(source.Value), source.Path, source.Domain)
        {
            Expires = source.Expires,
            HttpOnly = source.IsHttpOnly,
            Secure = source.IsSecure
        });
    }
    return converted;
}

private static DateTime GetDate(Post post)
{
return DateTime.Parse(post.DateGmt);
Expand Down
10 changes: 7 additions & 3 deletions src/TumblThree/TumblThree.Applications/Crawler/CrawlerFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,19 @@ public class CrawlerFactory : ICrawlerFactory
private readonly IShellService shellService;
private readonly ISharedCookieService cookieService;
private readonly AppSettings settings;
private readonly IEnvironmentService environmentService;
private readonly ILoginService loginService;

[ImportingConstructor]
internal CrawlerFactory(ICrawlerService crawlerService, IManagerService managerService, ShellService shellService,
ISharedCookieService cookieService)
ISharedCookieService cookieService, IEnvironmentService environmentService, ILoginService loginService)
{
this.crawlerService = crawlerService;
this.managerService = managerService;
this.shellService = shellService;
this.cookieService = cookieService;
this.environmentService = environmentService;
this.loginService = loginService;
settings = shellService.Settings;
}

Expand Down Expand Up @@ -80,14 +84,14 @@ public ICrawler GetCrawler(IBlog blog, IProgress<DownloadProgress> progress, Pau
cookieService, GetTumblrDownloader(progress, blog, files, postQueue, pt, ct),
GetJsonDownloader(jsonSvcQueue, blog, pt, ct), GetTumblrSvcJsonToTextParser(blog), GetTumblrParser(),
imgurParser, gfycatParser, GetWebmshareParser(), GetUguuParser(),
GetCatBoxParser(), postQueue, jsonSvcQueue, blog, progress, pt, ct);
GetCatBoxParser(), postQueue, jsonSvcQueue, blog, progress, environmentService, loginService, pt, ct);
case BlogTypes.tlb:
IPostQueue<CrawlerData<DataModels.TumblrSearchJson.Data>> jsonDataQueue = GetJsonQueue<DataModels.TumblrSearchJson.Data>();
return new TumblrLikedByCrawler(shellService, crawlerService, webRequestFactory,
cookieService, GetTumblrDownloader(progress, blog, files, postQueue, pt, ct), GetJsonDownloader(jsonDataQueue, blog, pt, ct),
GetTumblrApiJsonToTextParser(blog), GetTumblrParser(),
imgurParser, gfycatParser, GetWebmshareParser(), GetUguuParser(),
GetCatBoxParser(), postQueue, jsonDataQueue, blog, progress, pt, ct);
GetCatBoxParser(), postQueue, jsonDataQueue, blog, progress, environmentService, loginService, pt, ct);
case BlogTypes.tumblrsearch:
IPostQueue<CrawlerData<string>> jsonQueue = GetJsonQueue<string>();
return new TumblrSearchCrawler(shellService, crawlerService, webRequestFactory,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ public TumblrBlogCrawler(IShellService shellService, ICrawlerService crawlerServ
IPostQueue<AbstractPost> postQueue, IPostQueue<CrawlerData<Post>> jsonQueue, IBlog blog,
IProgress<DownloadProgress> progress, PauseToken pt, CancellationToken ct)
: base(shellService, crawlerService, webRequestFactory, cookieService, tumblrParser, imgurParser, gfycatParser,
webmshareParser, uguuParser, catboxParser, postQueue, blog, downloader, crawlerDataDownloader, progress, pt, ct)
webmshareParser, uguuParser, catboxParser, postQueue, blog, downloader, crawlerDataDownloader,
progress, null, null, pt, ct)
{
this.downloader = downloader;
this.downloader.ChangeCancellationToken(Ct);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,11 @@ public TumblrHiddenCrawler(IShellService shellService, ICrawlerService crawlerSe
ISharedCookieService cookieService, IDownloader downloader, ICrawlerDataDownloader crawlerDataDownloader,
ITumblrToTextParser<Post> tumblrJsonParser, ITumblrParser tumblrParser, IImgurParser imgurParser,
IGfycatParser gfycatParser, IWebmshareParser webmshareParser, IUguuParser uguuParser, ICatBoxParser catboxParser,
IPostQueue<AbstractPost> postQueue, IPostQueue<CrawlerData<Post>> jsonQueue, IBlog blog, IProgress<DownloadProgress> progress, PauseToken pt, CancellationToken ct)
IPostQueue<AbstractPost> postQueue, IPostQueue<CrawlerData<Post>> jsonQueue, IBlog blog, IProgress<DownloadProgress> progress,
IEnvironmentService environmentService, ILoginService loginService, PauseToken pt, CancellationToken ct)
: base(shellService, crawlerService, webRequestFactory, cookieService, tumblrParser, imgurParser, gfycatParser,
webmshareParser, uguuParser, catboxParser, postQueue, blog, downloader, crawlerDataDownloader, progress, pt,
ct)
webmshareParser, uguuParser, catboxParser, postQueue, blog, downloader, crawlerDataDownloader,
progress, environmentService, loginService, pt, ct)
{
this.downloader = downloader;
this.tumblrJsonParser = tumblrJsonParser;
Expand Down Expand Up @@ -211,13 +212,30 @@ private async Task CrawlPageAsync(int pageNumber)
{
try
{
string document = await GetSvcPageAsync(Blog.PageSize.ToString(), (Blog.PageSize * pageNumber).ToString());
string document = null;
try
{
document = await GetSvcPageAsync(Blog.PageSize.ToString(), (Blog.PageSize * pageNumber).ToString());
}
catch (WebException webEx)
{
if (HandleUnauthorizedWebExceptionRetry(webEx))
{
await FetchCookiesAgainAsync();
document = await GetSvcPageAsync(Blog.PageSize.ToString(), (Blog.PageSize * pageNumber).ToString());
}
else
{
throw;
}
}
var response = ConvertJsonToClass<TumblrJson>(document);
await AddUrlsToDownloadListAsync(response, pageNumber);
}
catch (WebException webException)
{
if (HandleLimitExceededWebException(webException))
if (HandleLimitExceededWebException(webException) ||
HandleUnauthorizedWebExceptionRetry(webException))
{
incompleteCrawl = true;
}
Expand Down Expand Up @@ -252,6 +270,16 @@ private async Task<ulong> GetHighestPostIdAsync()
}

HandleLimitExceededWebException(webException);
if (HandleUnauthorizedWebExceptionRetry(webException))
{
await FetchCookiesAgainAsync();
try
{
return await GetHighestPostIdCoreAsync();
}
catch (WebException)
{ }
}
return lastId;
}
catch (TimeoutException timeoutException)
Expand Down Expand Up @@ -349,7 +377,8 @@ protected virtual async Task<string> RequestDataAsync(string limit, string offse
CookieService.GetUriCookie(request.CookieContainer, new Uri("https://www.tumblr.com/"));
CookieService.GetUriCookie(request.CookieContainer, new Uri("https://" + Blog.Name.Replace("+", "-") + ".tumblr.com"));
requestRegistration = Ct.Register(() => request.Abort());
return await WebRequestFactory.ReadRequestToEndAsync(request, true);
string response = await WebRequestFactory.ReadRequestToEndAsync(request, true);
return response;
}
finally
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@ public TumblrLikedByCrawler(IShellService shellService, ICrawlerService crawlerS
ISharedCookieService cookieService, IDownloader downloader, ICrawlerDataDownloader crawlerDataDownloader,
ITumblrToTextParser<Post> tumblrJsonParser, ITumblrParser tumblrParser, IImgurParser imgurParser,
IGfycatParser gfycatParser, IWebmshareParser webmshareParser, IUguuParser uguuParser, ICatBoxParser catboxParser,
IPostQueue<AbstractPost> postQueue, IPostQueue<CrawlerData<DataModels.TumblrSearchJson.Data>> jsonQueue, IBlog blog, IProgress<DownloadProgress> progress, PauseToken pt, CancellationToken ct)
IPostQueue<AbstractPost> postQueue, IPostQueue<CrawlerData<DataModels.TumblrSearchJson.Data>> jsonQueue, IBlog blog, IProgress<DownloadProgress> progress,
IEnvironmentService environmentService, ILoginService loginService, PauseToken pt, CancellationToken ct)
: base(shellService, crawlerService, webRequestFactory, cookieService, tumblrParser, imgurParser, gfycatParser,
webmshareParser, uguuParser, catboxParser, postQueue, blog, downloader, crawlerDataDownloader,
progress, pt, ct)
progress, environmentService, loginService, pt, ct)
{
this.downloader = downloader;
this.tumblrJsonParser = tumblrJsonParser;
Expand Down Expand Up @@ -160,7 +161,22 @@ private async Task CrawlPageAsync(int crawlerNumber)
string document = "";
try
{
document = await GetRequestAsync(url);
try
{
document = await GetRequestAsync(url);
}
catch (WebException webEx)
{
if (HandleUnauthorizedWebException(webEx))
{
await FetchCookiesAgainAsync();
document = await GetRequestAsync(url);
}
else
{
throw;
}
}
if (!isLikesUrl)
{
document = Regex.Unescape(document);
Expand Down Expand Up @@ -322,6 +338,10 @@ private bool PostWithinTimespan(DataModels.TumblrSearchJson.Data post)
private static List<DataModels.TumblrSearchJson.Data> ExtractPosts(string document)
{
var extracted = extractJsonFromLikes.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(extracted))
{
Logger.Verbose("TumblrLikedByCrawler:ExtractPosts: data not found inside: \n{0}", document);
}
dynamic obj = JsonConvert.DeserializeObject(extracted);
var likedPosts = obj.Likes.likedPosts;
extracted = JsonConvert.SerializeObject(likedPosts);
Expand All @@ -337,6 +357,7 @@ private async Task DownloadPage(List<DataModels.TumblrSearchJson.Data> posts)
CheckIfShouldPause();
if (!PostWithinTimespan(post)) { continue; }

Logger.Verbose("TumblrLikedByCrawler.DownloadPage: {0}", post.PostUrl);
try
{
Post data = null;
Expand Down Expand Up @@ -648,9 +669,17 @@ private async Task<bool> CheckIfLoggedInAsync()
{
var url = Blog.Url + (TumblrLikedByBlog.IsLikesUrl(Blog.Url) ? "" : "/page/1");
string document = await GetRequestAsync(url);
if (string.IsNullOrEmpty(document))
{
Logger.Verbose("TumblrLikedByCrawler:CheckIfLoggedInAsync: empty response!");
}
if (document.Contains("___INITIAL_STATE___"))
{
var extracted = extractJsonFromLikes.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(extracted))
{
Logger.Verbose("TumblrLikedByCrawler:CheckIfLoggedInAsync: data not found inside: \n{0}", document);
}
dynamic obj = JsonConvert.DeserializeObject<ExpandoObject>(extracted);
var loggedIn = obj?.isLoggedIn?.isLoggedIn ?? false;
return loggedIn;
Expand Down
Loading

0 comments on commit 5fe3c7b

Please sign in to comment.