Merge pull request #79 from theslavicbear/feature-3.2
Feature 3.2
theslavicbear authored Jul 18, 2021
2 parents d21b1ab + 4c74d69 commit 8ea2183
Showing 4 changed files with 102 additions and 25 deletions.
28 changes: 23 additions & 5 deletions Ebook-Publisher.py
@@ -33,8 +33,8 @@
 def MakeText(site):
     if type(site) is not Nhentai.Nhentai:
         published=open(wd+site.title+'.txt', 'w', encoding="utf-8")
-        published.write(site.title+'\n')
-        published.write('by '+site.author+'\n\n')
+        published.write(site.title+Common.lineEnding)
+        published.write('by '+site.author+Common.lineEnding)
         published.write(site.story)
         published.close()

@@ -54,11 +54,20 @@ def MakeHTML(site):
             for i in range(len(site.rawstoryhtml)):
                 published.write('<p><a href="#Chapter '+str(i)+'">'+site.chapters[i]+'</a></p>\n')
         elif not site.backwards:
+            j=0
             for i in range(len(site.rawstoryhtml)):
                 if i!=0:
-                    published.write('<p><a href="#'+str(site.depth[i-1])+'">'+str(' _'*int((len(site.depth[i-1])/2)+1))+' '+str(int((len(site.depth[i-1])/2)+2))+'.'+site.depth[i-1].split('.')[-1]+' '+site.chapters[i]+'</a></p>\n')
+                    if site.partial:
+                        published.write('<p><a href="#'+str(site.depth[i-1])+'">'+str(' _'*int((len(site.depth[i-1])/2)+1))+' '+str(int((site.partialStart+len(site.depth[i-1])/2)+1))+'.'+site.depth[i-1].split('.')[-1]+' '+site.chapters[i]+'</a></p>\n')
+                    else:
+                        published.write('<p><a href="#'+str(site.depth[i-1])+'">'+str(' _'*int((len(site.depth[i-1])/2)+1))+' '+str(int((len(site.depth[i-1])/2)+2))+'.'+site.depth[i-1].split('.')[-1]+' '+site.chapters[i]+'</a></p>\n')
                 else:
-                    published.write('<p><a href="#Chapter '+str(i)+'">'+'1.1 '+site.chapters[i]+'</a></p>\n')
+                    if site.partial:
+                        j=site.partialStart
+                        published.write('<p><a href="#Chapter '+str(i)+'">'+str(j)+'. '+site.chapters[i]+'</a></p>\n')
+                        j+=1
+                    else:
+                        published.write('<p><a href="#Chapter '+str(i)+'">'+'1.1 '+site.chapters[i]+'</a></p>\n')
         else:
             for i in range(len(site.rawstoryhtml)):
                 published.write('<p><a href="#Chapter '+str(i)+'">'+site.chapters[i]+'</a></p>\n')
@@ -110,7 +119,10 @@ def MakeEpub(site):
             if i == 0:
                 c.append(epub.EpubHtml(title=site.chapters[i], file_name='Chapter '+str(i+1)+'.xhtml', lang='en'))
             else:
-                c.append(epub.EpubHtml(title=site.chapters[i], file_name=str(site.depth[i-1])+'.xhtml', lang='en', tocTitle=str(' _'*int((len(site.depth[i-1])/2)+1))+' '+str(int((len(site.depth[i-1])/2)+2))+'.'+site.depth[i-1].split('.')[-1]+' '+site.chapters[i]))
+                if not site.partial:
+                    c.append(epub.EpubHtml(title=site.chapters[i], file_name=str(site.depth[i-1])+'.xhtml', lang='en', tocTitle=str(' _'*int((len(site.depth[i-1])/2)+1))+' '+str(int((len(site.depth[i-1])/2)+2))+'.'+site.depth[i-1].split('.')[-1]+' '+site.chapters[i]))
+                else:
+                    c.append(epub.EpubHtml(title=site.chapters[i], file_name=str(site.depth[i-1])+'.xhtml', lang='en', tocTitle=str(' _'*int((len(site.depth[i-1])/2)+1))+' '+str(int((site.partialStart+len(site.depth[i-1])/2)+1))+'.'+site.depth[i-1].split('.')[-1]+' '+site.chapters[i]))
             c[i].content='<h2>\n'+site.chapters[i]+'\n</h2>\n'+str(site.epubrawstoryhtml[i])
         elif type(site) is Nhentai.Nhentai:
             c.append(epub.EpubHtml(title=site.chapters[i], file_name='Chapter '+str(i+1)+'.xhtml', lang='en'))
@@ -221,6 +233,8 @@ def getCSS():
 parser.add_argument('-n', '--no-duplicates', help='Skips stories if they have already been downloaded', action='store_true')
 parser.add_argument('-s', '--css', '--style-sheet', help='either a CSS string or a .css file to use for formatting', default='')
 parser.add_argument('--chyoa-force-forwards', help='Force Chyoa stories to be scraped forwards if not given page 1', action='store_true')
+parser.add_argument('--eol', help='end of line character for .txt output format, must be enclosed in single quotes', default='\n\n')
+parser.add_argument('--chyoa-update', help='Checks if story already exists in output directory, and skips it if it has not been updated on the server since file was created.', action='store_true')
 args=parser.parse_args()

 #print(args.output_type)
@@ -245,6 +259,10 @@
 if args.chyoa_force_forwards:
     Common.chyoa_force_forwards=True

+if args.chyoa_update:
+    Common.chyoaDupCheck=True
+
+Common.lineEnding=args.eol.encode('latin-1', 'backslashreplace').decode('unicode-escape')

 if args.directory is None:
     wd='./'
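
Reviewer's note on the new --eol option (a minimal sketch, not part of the commit): argparse hands the backslash sequences over literally, and the encode/decode round-trip on the Common.lineEnding line above turns them into real control characters.

```python
# Minimal sketch of the --eol decoding used above, assuming a POSIX shell
# that passes --eol '\r\n' through as four literal characters.
raw = r'\r\n'
decoded = raw.encode('latin-1', 'backslashreplace').decode('unicode-escape')
assert decoded == '\r\n'  # now two control characters, not four literals
```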
62 changes: 47 additions & 15 deletions Site/Chyoa.py
@@ -10,6 +10,7 @@
 import queue
 import copy
 import urllib.parse
+from datetime import datetime
 lock = Lock()
 lock2 = Lock()
@@ -43,12 +44,14 @@ def __init__(self, url):
         self.images=[] #testing images
         self.hasimages = False
         self.duplicate = False
-        self.backwards = True
+        self.backwards = not Common.chyoa_force_forwards
         self.depth = []
         self.quiet = Common.quiet
         self.epubnextpages = []
         self.nextLinks=[]

+        self.partial = False
+        self.partialStart=1
         self.ogUrl=self.url

         page = Common.RequestPage(url)

@@ -66,12 +69,33 @@ def __init__(self, url):
            except:
                pass

+
+
+
+
+        elif not self.backwards:
+            self.partial = True
+
+
+        #get update timestamp:
+        if (self.backwards or self.partial) and Common.chyoaDupCheck:
+            date=soup.find('p', attrs={'class':'dates'}).strong.get_text()
+            #date='Jun 18, 2022'
+            timestamp=datetime.strptime(date, "%b %d, %Y")
+            #print(timestamp)
+            if not Common.CheckDuplicateTime(self.title, timestamp):
+                Common.prnt('Story not updated: '+self.url, f=True)
+                self.duplicate= True
+                return None
+
+        #check duplicate with timestamp
+
         if Common.dup:
             if Common.CheckDuplicate(self.title):
                 self.duplicate = True
                 return None

-        if self.backwards:
+        if self.backwards or self.partial:
             self.authors.insert(0,soup.find_all('a')[7].get_text())
         else:
             self.authors.insert(0,soup.find_all('a')[5].get_text())
Expand All @@ -80,7 +104,8 @@ def __init__(self, url):

tmp=soup.find('p', attrs={'class': 'meta'}).get_text()
t=[s for s in tmp.split() if s.isdigit()]
self.length=int(t[0])
self.length=int(t[0])
self.partialStart=self.length


if soup.find('form', attrs={'id':'immersion-form'}) is not None:
@@ -138,9 +163,9 @@ def __init__(self, url):


         #if soup.find('a').text.strip()==
-        self.backwards = False
+        #self.backwards = not Common.chyoa_force_forwards
         for i in soup.find_all('a'):
-            if i.text.strip()=='Previous Chapter':
+            if i.text.strip()=='Previous Chapter' and self.backwards:
                 self.AddPrevPage(i.get('href'))
                 self.backwards = True
                 break
@@ -159,8 +184,11 @@ def __init__(self, url):
         numChapters=numChapters.replace(',','')
         try:
             if not Common.mt:
-                self.pbar=Common.Progress(int(numChapters))
-                self.pbar.Update()
+                if self.partial:
+                    print('Downloading an unknown number of pages')
+                else:
+                    self.pbar=Common.Progress(int(numChapters))
+                    self.pbar.Update()
         except:
             pass

@@ -191,14 +219,17 @@ def __init__(self, url):
             self.Pages.extend(urls)
             j=1
             for u in urls:
-                if Common.mt:
+                if Common.mt and not self.partial:
                     chapNum = int(soup.find('p', attrs={'class':'meta'}).get_text().split()[1])
                     firstLinkId=None
                     threading.Thread(target=self.ThreadAdd, args=(u, j, self.renames, self.oldnames, chapNum, '<a href="#Chapter 0">Previous Chapter</a>\n<br />', '\n<a href="'+'Chapter 1'+'.xhtml">'+'Previous Chapter'+'</a>\n<br />', self.nextLinks[j-1], firstLinkId, self.url), daemon=True).start() #TODO
                 else:
+                    if Common.mt:
+                        Common.prnt('Warning: Cannot multithread partial Chyoa story: '+self.url+'\nUsing default method to download an unknown number of pages')
+
                     self.AddNextPage(u, j, 1, '<a href="#Chapter 0">Previous Chapter</a>\n<br />', '\n<a href="'+'Chapter 1'+'.xhtml">'+'Previous Chapter'+'</a>\n<br />', self.nextLinks[j-1], None)
                 j+=1
-            if Common.mt:
+            if Common.mt and not self.partial:
                 i = int(numChapters)-1
                 print("Pages to add: "+str(i))
                 while i >0:
@@ -272,7 +303,7 @@ def __init__(self, url):



-        self.story=self.story.replace('\n', '\n\n')
+        self.story=self.story.replace('\n', Common.lineEnding)

         for i in range(0,len(self.truestoryhttml)):
             self.rawstoryhtml[i]=BeautifulSoup(self.truestoryhttml[i], 'html.parser')
@@ -348,7 +379,8 @@ def AddNextPage(self, url, depth, prevChapNum, prevLink, epubPrevLink, currLink,
         temp='<div id="'+str(depth)+'">'+str(temp2)
         self.questions.append(soup.find('header', attrs={'class':"question-header"}).get_text())
         temp+='<h2>'+self.questions[-1]+'</h2>\n</div>'
-        #Common.prnt(str(depth))
+        if self.partial:
+            Common.prnt(str(depth))
         j = 1

         nextpages=[]
@@ -414,9 +446,9 @@ def AddNextPage(self, url, depth, prevChapNum, prevLink, epubPrevLink, currLink,
                 self.AddNextPage(i.get('href'), str(depth)+'.'+str(j), chapNum, currLink, epubCurrLink, nextLink, currLinkId)

     def ThreadAdd(self, url, depth, renames, oldnames, chapNum, currLink, epubCurrLink, nextLink, currLinkId, ogUrl):
-        if self.Pages.count(url)>1:
-            print("found issue at" + str(url))
-            return None
+        #if self.Pages.count(url)>1:
+        #    print("found issue at" + str(url))
+        #    return None
         self.Pages[self.Pages.index(url)]=(Page(url, depth, renames, oldnames, self.q, chapNum, currLink, epubCurrLink, nextLink, currLinkId, ogUrl))

     def addPage(self, page):
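
A minimal sketch (not part of the commit) of what the new timestamp check in __init__ does with the date string it scrapes; 'Jun 18, 2022' is the sample value left commented out in the diff:

```python
from datetime import datetime

# The 'dates' <p> on a Chyoa page yields a string like 'Jun 18, 2022'.
# strptime parses it to midnight of that day; Common.CheckDuplicateTime
# then compares it against the local file's modification time.
timestamp = datetime.strptime('Jun 18, 2022', '%b %d, %Y')
print(timestamp)  # 2022-06-18 00:00:00
```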
35 changes: 31 additions & 4 deletions Site/Common.py
@@ -1,4 +1,5 @@
 import sys, urllib, os, requests, time
+from datetime import datetime

 #Module contains common functions needed by sites

Expand All @@ -16,14 +17,16 @@

dup = False

chyoaDupCheck=False

chyoa_force_forwards=False

mt = False

urlDict= {}

def prnt(out, f=False):
if not quiet and not f:
if not quiet or f:
print(out)

def imageDL(title, url, num, size=0, pbar=None, queue=None):
@@ -52,13 +55,37 @@ def imageDL(title, url, num, size=0, pbar=None, queue=None):


 def CheckDuplicate(title):
-    if opf == 'epub':
+    if any(x in ('epub', 'EPUB') for x in opf):
         return os.path.isfile(wd+title+'.epub')
-    elif opf == 'txt':
+    elif any(x in ('txt', 'TXT') for x in opf):
         return os.path.isfile(wd+title+'.txt') or os.path.exists(wd+title)
-    elif opf == 'html':
+    elif any(x in ('html', 'HTML') for x in opf):
         return os.path.isfile(wd+title+'.html') or os.path.exists(wd+title)

+def CheckDuplicateTime(title, timeObject):
+    if any(x in ('epub', 'EPUB') for x in opf):
+        if os.path.isfile(wd+title+'.epub'):
+            #print(time.ctime(os.path.getmtime(wd+title+'.epub')))
+            if timeObject > datetime.strptime(time.ctime(os.path.getmtime(wd+title+'.epub')), '%a %b %d %H:%M:%S %Y'):
+                return True
+    elif any(x in ('txt', 'TXT') for x in opf):
+        if os.path.isfile(wd+title+'.txt'):
+            if timeObject > datetime.strptime(time.ctime(os.path.getmtime(wd+title+'.txt')), '%a %b %d %H:%M:%S %Y'):
+                return True
+        elif os.path.exists(wd+title):
+            if timeObject > datetime.strptime(time.ctime(os.path.getmtime(wd+title)), '%a %b %d %H:%M:%S %Y'):
+                return True
+
+    elif any(x in ('html', 'HTML') for x in opf):
+        if os.path.isfile(wd+title+'.html'):
+            if timeObject > datetime.strptime(time.ctime(os.path.getmtime(wd+title+'.html')), '%a %b %d %H:%M:%S %Y'):
+                return True
+        elif os.path.exists(wd+title):
+            #print(datetime.strptime(time.ctime(os.path.getmtime(wd+title)), '%a %b %d %H:%M:%S %Y'))
+            if timeObject > datetime.strptime(time.ctime(os.path.getmtime(wd+title)), '%a %b %d %H:%M:%S %Y'):
+                return True
+    return False
+
+
 def GetImage(url):
     try:
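
The mtime comparison in CheckDuplicateTime round-trips the file time through time.ctime() and strptime(). A minimal sketch (assuming a writable working directory and a hypothetical example.txt) showing that the round-trip equals datetime.fromtimestamp() truncated to whole seconds:

```python
import os, time
from datetime import datetime

open('example.txt', 'w').close()          # throwaway file for getmtime()
mtime = os.path.getmtime('example.txt')
parsed = datetime.strptime(time.ctime(mtime), '%a %b %d %H:%M:%S %Y')
# Same instant in local time, with sub-second precision dropped:
assert parsed == datetime.fromtimestamp(mtime).replace(microsecond=0)
```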
2 changes: 1 addition & 1 deletion Site/Wattpad.py
@@ -67,7 +67,7 @@ def __init__(self, url):
         for i in range(0, len(self.rawstoryhtml)):
             self.story=self.story+self.chapters[i]+'\n'
             self.story=self.story+self.rawstoryhtml[i].get_text()
-        self.story=self.story.replace('\n', '\n\n')
+        self.story=self.story.replace('\n', Common.lineEnding)

     def addNextPage(self, url):
         soup=BeautifulSoup(self.requestPage(url).content, 'html.parser')
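
For context, the effect of swapping the hard-coded '\n\n' for Common.lineEnding (a sketch using the default value):

```python
# With the default --eol of '\n\n', every line break in the scraped story
# becomes a blank line between paragraphs; a custom --eol changes that.
story = 'Chapter 1\nSome text.'
print(story.replace('\n', '\n\n'))
```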
