From f14fe9d3aa93ec65d2b841ff9fa41cf12296ebc2 Mon Sep 17 00:00:00 2001
From: Jim Miller
Date: Fri, 26 Apr 2024 12:24:10 -0500
Subject: [PATCH] adapter_literotica: Rewrite(mostly) for site changes. #1058
---
calibre-plugin/plugin-defaults.ini | 13 +-
fanficfare/adapters/adapter_literotica.py | 506 ++++++----------------
fanficfare/defaults.ini | 13 +-
3 files changed, 147 insertions(+), 385 deletions(-)
diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index 662c9cb3f..a9c9420ae 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -2241,10 +2241,9 @@ extracategories:Lois & Clark: The New Adventures of Superman
[literotica.com]
use_basic_cache:true
user_agent:
-extra_valid_entries:eroticatags,averrating
+extra_valid_entries:eroticatags
eroticatags_label:Erotica Tags
-averrating_label:Average Rating
-extra_titlepage_entries:eroticatags,averrating
+extra_titlepage_entries:eroticatags
## Extract more erotica_tags from the meta tag of each chapter
use_meta_keywords: true
@@ -2267,14 +2266,6 @@ clean_chapter_titles: false
## Add the chapter description at the start of each chapter.
description_in_chapter: false
-## Force chapters in a story to be sorted by date instead of the order
-## given by the author. Used to be the default for literotica.
-## Note that FFF normalizes literotica.com story URLs to the first
-## chapter URL. If the first chapter is not the same by date and by
-## list, you may need to set order_chapters_by_date under *both*
-## [storyURL] sections.
-order_chapters_by_date:false
-
## Clear FanFiction from defaults, site is original fiction.
extratags:Erotica
diff --git a/fanficfare/adapters/adapter_literotica.py b/fanficfare/adapters/adapter_literotica.py
index d7527c101..840d36846 100644
--- a/fanficfare/adapters/adapter_literotica.py
+++ b/fanficfare/adapters/adapter_literotica.py
@@ -47,7 +47,6 @@ def __init__(self, config, url):
# where first chapter doesn't have '-ch-'.
# Now just rely on extractChapterUrlsAndMetadata to reset
# storyId to first chapter link.
- storyId = self.parsedUrl.path.split('/',)[2]
## DON'T normalize to www.literotica.com--keep for language,
## which will be set in _setURL(url). Also, multi-chapter
@@ -66,7 +65,7 @@ def __init__(self, config, url):
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
- self.dateformat = "%m/%d/%y"
+ self.dateformat = "%m/%d/%Y"
@staticmethod
def getSiteDomain():
@@ -78,11 +77,12 @@ def getAcceptDomains(cls):
@classmethod
def getSiteExampleURLs(cls):
- return "http://www.literotica.com/s/story-title https://www.literotica.com/s/story-title https://www.literotica.com/s/story-title https://www.literotica.com/i/image-or-comic-title https://www.literotica.com/p/poem-title http://portuguese.literotica.com/s/story-title http://german.literotica.com/s/story-title"
+ return "http://www.literotica.com/s/story-title https://www.literotica.com/series/se/9999999 https://www.literotica.com/s/story-title https://www.literotica.com/i/image-or-comic-title https://www.literotica.com/p/poem-title http://portuguese.literotica.com/s/story-title http://german.literotica.com/s/story-title"
def getSiteURLPattern(self):
+ # also https://www.literotica.com/series/se/80075773
# /s/ for story, /i/ for image/comic, /p/ for poem
- return r"https?://"+LANG_RE+r"(\.i)?\.literotica\.com/(beta/)?[sip]/([a-zA-Z0-9_-]+)"
+ return r"https?://"+LANG_RE+r"(\.i)?\.literotica\.com/((beta/)?[sip]/([a-zA-Z0-9_-]+)|series/se/(?P<storyseriesid>[0-9]+))"
def _setURL(self,url):
# logger.debug("set URL:%s"%url)
@@ -91,349 +91,166 @@ def _setURL(self,url):
lang = m.group('lang')
if lang not in ('www','other'):
self.story.setMetadata('language',lang.capitalize())
+ # reset storyId
+ self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[-1])
# logger.debug("language:%s"%self.story.getMetadata('language'))
- def getCategories(self, soup):
+ def parseMetaEroticaTags(self, soup):
if self.getConfig("use_meta_keywords"):
- categories = soup.find("meta", {"name":"keywords"})['content'].split(',')
- categories = [c for c in categories if not self.story.getMetadata('title') in c]
- if self.story.getMetadata('author') in categories:
- categories.remove(self.story.getMetadata('author'))
- # logger.debug("Meta = %s" % categories)
- for category in categories:
- # logger.debug("\tCategory=%s" % category)
-# self.story.addToList('category', category.title())
- self.story.addToList('eroticatags', category.title())
+ tags = soup.find("meta", {"name":"keywords"})['content'].split(',')
+ tags = [t for t in tags if not self.story.getMetadata('title') in t]
+ if self.story.getMetadata('author') in tags:
+ tags.remove(self.story.getMetadata('author'))
+ for tag in tags:
+ self.story.addToList('eroticatags', tag.title())
def extractChapterUrlsAndMetadata(self):
"""
- NOTE: Some stories can have versions,
- e.g. /my-story-ch-05-version-10
- NOTE: If two stories share the same title, a running index is added,
- e.g.: /my-story-ch-02-1
- Strategy:
- * Go to author's page, search for the current story link,
- * If it's in a tr.root-story => One-part story
- * , get metadata and be done
- * If it's in a tr.sl => Chapter in series
- * Search up from there until we find a tr.ser-ttl (this is the
- story)
- * Gather metadata
- * Search down from there for all tr.sl until the next
- tr.ser-ttl, foreach
- * Chapter link is there
+ In April 2024, the site introduced significant changes, including
+ adding a 'Story Series' page and a link to it in each chapter.
+ Not every story has one, though: one-shots don't have a 'Story Series'.
+
+ literotica has 'Story Series' & 'Story'. FFF calls them 'Story' & 'Chapters'
+ See https://github.com/JimmXinu/FanFicFare/issues/1058#issuecomment-2078490037
+
+ So /series/se/ will be the story URL for multi-chapter stories, but
+ one-shots keep their individual 'chapter' URL.
"""
+ logger.debug("Chapter/Story URL: <%s> " % self.url)
if not (self.is_adult or self.getConfig("is_adult")):
raise exceptions.AdultCheckRequired(self.url)
- # logger.debug("Chapter/Story URL: <%s> " % self.url)
-
- (data1,rurl) = self.get_request_redirected(self.url)
+ (data,rurl) = self.get_request_redirected(self.url)
+ # logger.debug(data)
## for language domains
self._setURL(rurl)
logger.debug("set opened url:%s"%self.url)
- soup1 = self.make_soup(data1)
- #strip comments from soup
- [comment.extract() for comment in soup1.findAll(string=lambda text:isinstance(text, Comment))]
+ soup = self.make_soup(data)
- if "This submission is awaiting moderator's approval" in data1:
+ if "This submission is awaiting moderator's approval" in data:
raise exceptions.StoryDoesNotExist("This submission is awaiting moderator's approval. %s"%self.url)
+ ## Not a series URL, so assumed to be a chapter. Look for the Story
+ ## Info block of the post-beta page. I don't think a missing block should happen?
+ if '/series/se' not in self.url:
+ if not soup.select_one('div.page__aside'):
+ raise exceptions.FailedToDownload("Missing Story Info block, Beta turned off?")
+
+ storyseriestag = soup.select_one('a.bn_av')
+ # logger.debug("Story Series Tag:%s"%storyseriestag)
+
+ if storyseriestag:
+ self._setURL(storyseriestag['href'])
+ data = self.get_request(storyseriestag['href'])
+ # logger.debug(data)
+ soup = self.make_soup(data)
+ # logger.debug(soup)
+ else:
+ logger.debug("One-shot")
+
+ isSingleStory = '/series/se' not in self.url
+
+ ## common between one-shots and multi-chapters
+
+ # title
+ self.story.setMetadata('title', stripHTML(soup.select_one('h1')))
+ # logger.debug(self.story.getMetadata('title'))
+
# author
- authora = soup1.find("a", class_="y_eU")
+ ## XXX This is still the author URL in the form:
+ ## https://www.literotica.com/stories/memberpage.php?uid=999999&page=submissions
+ ## because that's what's on the page. It redirects to the /authors/ page.
+ ## The only way I know right now to get the /authors/ URL is to make
+ ## the request and look at the redirect.
+ ## Should change to /authors/ if/when it starts appearing.
+ ## Assuming it's in the same place.
+ authora = soup.find("a", class_="y_eU")
authorurl = authora['href']
- # logger.debug(authora)
- # logger.debug(authorurl)
- self.story.setMetadata('authorId', urlparse.parse_qs(authorurl.split('?')[1])['uid'][0])
if authorurl.startswith('//'):
authorurl = self.parsedUrl.scheme+':'+authorurl
+ # logger.debug(authora)
+ # logger.debug(authorurl)
+ self.story.setMetadata('author', stripHTML(authora))
self.story.setMetadata('authorUrl', authorurl)
- self.story.setMetadata('author', authora.text)
-
- # get the author page
- if '/authors/' in authorurl and '/works/' not in authorurl:
- authorurl = authorurl + '/works/stories'
- ## Apr2024 site is redirecting memberpage.php to /authors/ for some users
- (dataAuth, rurl) = self.get_request_redirected(authorurl)
- if rurl:
- logger.debug("Author url(%s) redirected to (%s)"%(authorurl,rurl))
- if '/authors/' in rurl and '/works/' not in rurl:
- authorurl = rurl + '/works/stories'
- dataAuth = self.get_request(authorurl)
- # logger.debug(dataAuth)
- soupAuth = self.make_soup(dataAuth)
- #strip comments from soup
- [comment.extract() for comment in soupAuth.findAll(string=lambda text:isinstance(text, Comment))]
-# logger.debug(soupAuth)
-
- if '/authors/' in authorurl:
- return self.new_metadata(soup1, authorurl, soupAuth)
- else:
- return self.old_metadata(soup1, authorurl, soupAuth)
-
- def old_metadata(self, soup1, authorurl, soupAuth):
-
- ## Find link to url in author's page
- ## site has started using //domain.name/asdf urls remove https?: from front
- ## site has started putting https back on again.
- ## site is now using language specific german.lit... etc on author pages.
- ## site is now back to using www.lit... etc on author pages.
- ## allow for /i/ /p/ /s/ by using .com/ +4 instead of /s/
- search_url_re = r"https?://"+LANG_RE+r"(\.i)?\." + re.escape(self.getSiteDomain()) + self.url[self.url.index('.com/')+4:]+r"$"
- # logger.debug(search_url_re)
- storyLink = soupAuth.find('a', href=re.compile(search_url_re))
-# storyLink = soupAuth.find('a', href=re.compile(r'.*literotica.com/s/'+re.escape(self.story.getMetadata('storyId')) ))
-# storyLink = soupAuth.find('a', href=re.compile(r'(https?:)?'+re.escape(self.url[self.url.index(':')+1:]).replace(r'www',r'[^\.]+') ))
-# storyLink = soupAuth.find('a', href=self.url)#[self.url.index(':')+1:])
-
- if storyLink is not None:
- # pull the published date from the author page
- # default values from single link. Updated below if multiple chapter.
- # logger.debug("Found story on the author page.")
- date = storyLink.parent.parent.findAll('td')[-1].text
- self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
- self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
-
- if storyLink is not None:
- urlTr = storyLink.parent.parent
- if "sl" in urlTr['class']:
- isSingleStory = False
- else:
- isSingleStory = True
- else:
- raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (self.url, authorurl))
+ if '?' in authorurl:
+ self.story.setMetadata('authorId', urlparse.parse_qs(authorurl.split('?')[1])['uid'][0])
+ elif '/authors/' in authorurl:
+ self.story.setMetadata('authorId', authorurl.split('/')[-1])
+ else: # if all else fails
+ self.story.setMetadata('authorId', stripHTML(authora))
- if isSingleStory:
- self.story.setMetadata('title', storyLink.text.strip('/'))
- # logger.debug('Title: "%s"' % storyLink.text.strip('/'))
- self.setDescription(authorurl, urlTr.findAll("td")[1].text)
- self.story.addToList('category', urlTr.findAll("td")[2].text)
-# self.story.addToList('eroticatags', urlTr.findAll("td")[2].text)
- date = urlTr.findAll('td')[-1].text
- self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
- self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
- self.add_chapter(storyLink.text, self.url)
- averrating = stripHTML(storyLink.parent)
- ## title (0.00)
- averrating = averrating[averrating.rfind('(')+1:averrating.rfind(')')]
- try:
- self.story.setMetadata('averrating', float(averrating))
- except:
- pass
-# self.story.setMetadata('averrating',averrating)
- # parse out the list of chapters
- else:
- seriesTr = urlTr.previousSibling
- while 'ser-ttl' not in seriesTr['class']:
- seriesTr = seriesTr.previousSibling
- m = re.match(r"^(?P.*?):\s(?P\d+)\sPart\sSeries$", seriesTr.find("strong").text)
- self.story.setMetadata('title', m.group('title'))
- seriesTitle = m.group('title')
-
- ## Walk the chapters
- chapterTr = seriesTr.nextSibling
- dates = []
- descriptions = []
- ratings = []
- chapters = []
- chapter_name_type = None
- while chapterTr is not None and 'sl' in chapterTr['class']:
- description = "%d. %s" % (len(descriptions)+1,stripHTML(chapterTr.findAll("td")[1]))
- description = stripHTML(chapterTr.findAll("td")[1])
- chapterLink = chapterTr.find("td", "fc").find("a")
- if self.getConfig('chapter_categories_use_all'):
- self.story.addToList('category', chapterTr.findAll("td")[2].text)
- # self.story.addToList('eroticatags', chapterTr.findAll("td")[2].text)
- pub_date = makeDate(chapterTr.findAll('td')[-1].text, self.dateformat)
- dates.append(pub_date)
- chapterTr = chapterTr.nextSibling
-
- chapter_title = chapterLink.text
- if self.getConfig("clean_chapter_titles"):
- # logger.debug('\tChapter Name: "%s"' % chapterLink.text)
- seriesTitle = seriesTitle.lower()
- # strip trailing ch or pt before doing the chapter clean.
- # doesn't remove from story title metadata
- seriesTitle = re.sub(r'^(.*?)( (ch|pt))?$',r'\1',seriesTitle)
- if chapterLink.text.lower().startswith(seriesTitle):
- chapter = chapterLink.text[len(seriesTitle):].strip()
- # logger.debug('\tChapter: "%s"' % chapter)
- if chapter == '':
- chapter_title = 'Chapter %d' % (self.num_chapters() + 1)
- # Sometimes the first chapter does not have type of chapter
- if self.num_chapters() == 0:
- # logger.debug('\tChapter: first chapter without chapter type')
- chapter_name_type = None
- else:
- separater_char = chapter[0]
- # logger.debug('\tseparater_char: "%s"' % separater_char)
- chapter = chapter[1:].strip() if separater_char in [":", "-"] else chapter
- # logger.debug('\tChapter: "%s"' % chapter)
- if chapter.lower().startswith('ch.'):
- chapter = chapter[len('ch.'):].strip()
- try:
- chapter_title = 'Chapter %d' % int(chapter)
- except:
- chapter_title = 'Chapter %s' % chapter
- chapter_name_type = 'Chapter' if chapter_name_type is None else chapter_name_type
- # logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
- elif chapter.lower().startswith('pt.'):
- chapter = chapter[len('pt.'):].strip()
- try:
- chapter_title = 'Part %d' % int(chapter)
- except:
- chapter_title = 'Part %s' % chapter
- chapter_name_type = 'Part' if chapter_name_type is None else chapter_name_type
- # logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
- elif separater_char in [":", "-"]:
- chapter_title = chapter
- # logger.debug('\tChapter: taking chapter text as whole')
+ ## takes *eroticatags* entries from "+"
\n".join(descriptions)+"
")
+ self.story.extendList('eroticatags', [ stripHTML(t).title() for t in soup.select('div#tabpanel-tags a.av_as') ])
- if len(ratings) > 0:
- self.story.setMetadata('averrating','%4.2f' % (sum(ratings) / float(len(ratings))))
+ if isSingleStory:
+ ## one-shots don't *display* date info, but they have it
+ ## hidden in