Recipe request - Macleans Magazine

canislupus · 05-24-2011, 10:30 AM

Hi All,

Newbie here and I have a question. Maclean's Magazine is not downloading for me in Calibre, and when I did a Google search I found a recipe in the Calibre forum archives that someone had posted for Maclean's Magazine (see below), but when I tried to use it to "Fetch News" in Calibre it didn't work. Please help: how do I get the magazine downloaded into an e-book? It is a weekly magazine.

Thank you in advance!

#!/usr/bin/env python
__license__ = 'GPL v3'
'''
macleans.ca
'''
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
from datetime import timedelta, datetime, date
class Macleans(BasicNewsRecipe):
    '''Calibre news recipe that builds an issue from the macleans.ca section pages.

    Each entry of ``sectionlist`` is fetched and scraped for article links by
    ``parse_index``; every article is downloaded through its ``print/`` URL and
    reduced to the ``tw-print`` element.
    '''

    title = u'Macleans Magazine'
    __author__ = 'Nick Redding'
    language = 'en_CA'
    description = ('Macleans Magazine')
    no_stylesheets = True
    timefmt = ' [%b %d]'

    # customization notes: delete sections you are not interested in
    # set oldest_article to the maximum number of days back from today to include articles
    sectionlist = [
        ['http://www2.macleans.ca/', 'Front Page'],
        ['http://www2.macleans.ca/category/canada/', 'Canada'],
        ['http://www2.macleans.ca/category/world-from-the-magazine/', 'World'],
        ['http://www2.macleans.ca/category/business', 'Business'],
        ['http://www2.macleans.ca/category/arts-culture/', 'Culture'],
        ['http://www2.macleans.ca/category/opinion', 'Opinion'],
        ['http://www2.macleans.ca/category/health-from-the-magazine/', 'Health'],
        ['http://www2.macleans.ca/category/environment-from-the-magazine/', 'Environment'],
        ['http://www2.macleans.ca/category/education/', 'On Campus'],
        ['http://www2.macleans.ca/category/travel-from-the-magazine/', 'Travel'],
    ]
    oldest_article = 7

    # formatting for print version of articles
    extra_css = '''h2{font-family:Times,serif; font-size:large;}
        small {font-family:Times,serif; font-size:xx-small; list-style-type: none;}
        '''

    # tag handling for print version of articles
    keep_only_tags = [dict(id='tw-print')]
    remove_tags = [dict({'class':'postmetadata'})]

    def get_browser(self):
        '''Return the stock calibre browser; no customization is applied.'''
        # Pass self explicitly: the bare call BasicNewsRecipe.get_browser()
        # raises TypeError when get_browser is an instance method.
        # NOTE(review): confirm against the calibre version this recipe targets.
        return BasicNewsRecipe.get_browser(self)

    def preprocess_html(self, soup):
        '''Give every image its own <p> so it is not nested in a link or in text.

        An <a> that wraps an image is replaced by a new <p> holding the image.
        A non-empty <p> that contains an image is replaced by a <div> holding a
        new image-only <p> followed by the original paragraph.

        Returns the same soup object that was passed in.
        '''
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                new_tag = Tag(soup, 'p')
                new_tag.insert(0, img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                # Paragraphs that hold nothing but the image are left as they are.
                if not self.tag_to_string(parent_tag) == '':
                    new_div = Tag(soup, 'div')
                    new_tag = Tag(soup, 'p')
                    new_tag.insert(0, img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0, new_tag)
                    new_div.insert(1, parent_tag)
        return soup

    def parse_index(self):
        '''Scrape every page in ``sectionlist`` and build the article index.

        Returns a list of ``(section title, [article dict, ...])`` tuples in
        ``sectionlist`` order; sections that yielded no articles are omitted.
        Each article dict has the keys title, url, date, description, author
        and content.
        '''
        articles = {}
        ans = []

        month_names = ('january', 'february', 'march', 'april', 'may', 'june',
                       'july', 'august', 'september', 'october', 'november',
                       'december')

        def decode_date(datestr):
            '''Parse "<weekday>, <month> <day>, <year> ..." into a date.

            Raises IndexError or ValueError when the text has another shape.
            '''
            dmysplit = datestr.strip().lower().split(',')
            mdsplit = dmysplit[1].split()
            m = month_names.index(mdsplit[0]) + 1
            d = int(mdsplit[1])
            y = int(dmysplit[2].split()[0])
            return date(y, m, d)

        def first_link(tag):
            '''Return the first <a href=...> inside tag, or None (also for a None tag).'''
            if tag is None:
                return None
            return tag.find('a', href=True)

        def article_title(tag):
            '''Return the text of the first link inside tag, or '' if there is none.'''
            atag = first_link(tag)
            if atag is None:
                return ''
            return self.tag_to_string(atag)

        def article_url(tag):
            '''Return the print-version URL of the first link inside tag, or ''.'''
            atag = first_link(tag)
            if atag is None:
                return ''
            return atag['href'] + 'print/'

        def article_description(tag):
            '''Return the text of the first non-empty <p> inside tag, or ''.'''
            for p_tag in tag.findAll('p'):
                d = self.tag_to_string(p_tag, False)
                if not d == '':
                    return d
            return ''

        def compound_title(first, second):
            '''Join the text of two optional heading tags with an em dash.'''
            parts = [self.tag_to_string(t, False) for t in (first, second)
                     if t is not None]
            return u'\u2014'.join(parts)

        def compound_h4_h3_title(tag):
            return compound_title(tag.h4, tag.h3)

        def compound_h2_h4_title(tag):
            return compound_title(tag.h2, tag.h4)

        def add_article(section, title, url, description='', author='',
                        article_date=''):
            '''Queue one index entry under section.

            Entries without a URL are dropped: there is nothing to download
            for them.
            '''
            if not url:
                return
            articles.setdefault(section, []).append(dict(
                title=title, url=url, date=article_date,
                description=description, author=author, content=''))

        def parse_index_page(page_url, page_title):
            '''Fetch one section page and queue its articles under page_title.'''

            def handle_article(header_tag, outer_tag):
                '''Queue a dated category-page article unless it is too old.'''
                if header_tag is None:
                    return
                author_date_tag = outer_tag.h4
                if author_date_tag is None:
                    return
                url = article_url(header_tag)
                title = article_title(header_tag)
                # The <h4> byline reads "<author> - <date text>".
                author_date = self.tag_to_string(author_date_tag, False).split(' - ')
                author = author_date[0].strip()
                try:
                    article_date = decode_date(author_date[1])
                except (IndexError, ValueError):
                    # One malformed byline must not abort the whole download.
                    self.log("Skipping article with unreadable date: %s" % title)
                    return
                earliest_date = date.today() - timedelta(days=self.oldest_article)
                if article_date < earliest_date:
                    self.log("Skipping article dated %s" % author_date[1])
                    return
                excerpt_div = outer_tag.find('div', 'excerpt')
                if excerpt_div is not None:
                    description = article_description(excerpt_div)
                else:
                    description = ''
                add_article(page_title, title, url, description, author,
                            author_date[1])

            def handle_category_article(cat, header_tag, outer_tag):
                '''Queue a front-page category article, titled "<cat>\u2014<title>".'''
                if header_tag is None or outer_tag is None:
                    return
                url = article_url(header_tag)
                title = article_title(header_tag)
                if not title == '':
                    title = cat + u'\u2014' + title
                a_tag = outer_tag.find('span', 'authorLink')
                if a_tag is not None:
                    author = self.tag_to_string(a_tag, False)
                    # Remove the author element's parent so its text cannot be
                    # picked up as the description below.
                    a_tag.parent.extract()
                else:
                    author = ''
                description = article_description(outer_tag)
                add_article(page_title, title, url, description, author)

            soup = self.index_to_soup(page_url)

            if page_title == 'Front Page':
                # special processing for the front page
                top_stories = soup.find('div', { "id" : "macleansFeatured" })
                if top_stories is not None:
                    for div_slide in top_stories.findAll('div', 'slide'):
                        url = article_url(div_slide)
                        div_title = div_slide.find('div', 'header')
                        if div_title is not None:
                            title = self.tag_to_string(div_title, False)
                        else:
                            title = ''
                        add_article(page_title, title, url,
                                    article_description(div_slide))

                from_macleans = soup.find('div', { "id" : "fromMacleans" })
                if from_macleans is not None:
                    for li_tag in from_macleans.findAll('li', 'fromMacleansArticle'):
                        add_article(page_title, compound_h4_h3_title(li_tag),
                                    article_url(li_tag),
                                    article_description(li_tag))

                blog_central = soup.find('div', { "id" : "bloglist" })
                if blog_central is not None:
                    for li_tag in blog_central.findAll('li'):
                        if li_tag.h4 is not None:
                            add_article(page_title, compound_h2_h4_title(li_tag),
                                        article_url(li_tag.h4))

                # The "need to know" box was already disabled in the original recipe:
                # need_to_know = soup.find('div',{ "id" : "needToKnow" })
                # if need_to_know:
                #     for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}):
                #         title = compound_h4_h3_title(div_tag)
                #         url = article_url(div_tag)
                #         description = article_description(div_tag)
                #         add_article(page_title, title, url, description)

                for news_category in soup.findAll('div', 'newsCategory'):
                    news_cat = self.tag_to_string(news_category.h4, False)
                    handle_category_article(news_cat, news_category.find('h2'),
                                            news_category.find('div'))
                    for news_item in news_category.findAll('li'):
                        handle_category_article(news_cat, news_item.h3, news_item)

                return

            # find the div containing the highlight article
            div_post = soup.find('div', 'post')
            if div_post is not None:
                handle_article(div_post.h1, div_post)

                # find the divs containing the rest of the articles
                div_other = div_post.find('div', { "id" : "categoryOtherPosts" })
                if div_other is not None:
                    for div_entry in div_other.findAll('div', 'entry'):
                        handle_article(div_entry.h2, div_entry)

        for page_name, page_title in self.sectionlist:
            parse_index_page(page_name, page_title)
            ans.append(page_title)

        # Keep sectionlist order and drop sections that produced nothing.
        return [(section, articles[section]) for section in ans
                if section in articles]

sexymax15 · 06-17-2011, 08:02 AM

The built-in recipe does not work. Here's my recipe; it works fine, no problem whatsoever.

Quote:

class AdvancedUserRecipe1308306308(BasicNewsRecipe):
    '''Feed-based calibre recipe for the macleans.ca category RSS feeds.

    Articles are downloaded through their ``print/`` URL (see
    ``print_version``) instead of using the content embedded in the feed.
    '''

    title = u'Maclean Magazine (Canada)'
    oldest_article = 30
    max_articles_per_feed = 12

    use_embedded_content = False

    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True
    # Drop images, the page header and the post metadata block, and cut
    # everything that follows the post metadata.
    remove_tags = [dict(name ='img'),dict (id='header'),{'class':'postmetadata'}]
    remove_tags_after = {'class':'postmetadata'}

    feeds = [
        (u'Blog Central', u'http://www2.macleans.ca/category/blog-central/feed/'),
        (u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
        (u'World', u'http://www2.macleans.ca/category/world-from-the-magazine/feed/'),
        (u'Business', u'http://www2.macleans.ca/category/business/feed/'),
        (u'Arts & Culture', u'http://www2.macleans.ca/category/arts-culture/feed/'),
        (u'Opinion', u'http://www2.macleans.ca/category/opinion/feed/'),
        (u'Health', u'http://www2.macleans.ca/category/health-from-the-magazine/feed/'),
        (u'Environment', u'http://www2.macleans.ca/category/environment-from-the-magazine/feed/'),
    ]

    def print_version(self, url):
        '''Return the print-page URL for an article by appending ``print/``.

        NOTE(review): assumes ``url`` already ends with a slash - confirm
        against the links the feeds actually deliver.
        '''
        return url + 'print/'

Screenshot

Starson17 · 06-17-2011, 10:34 AM

Quote:

Originally Posted by sexymax15

The built-in recipe does not work. Here's my recipe; it works fine, no problem whatsoever.

@sexymax15: Thanks for posting several recipes.
A tip: It would be appreciated if you would put your recipes into code tags (use the hash button) and then surround the code tags with spoiler tags (use the button having an eye with a red X over it). The code tags preserve required Python formatting and the spoiler tags make the thread much easier to read.

BRGriff · 06-22-2011, 02:19 PM

I learned something new from this discussion which will help when I myself post a recipe. Thank you.

Spoiler:

Starson17 · 06-22-2011, 02:45 PM

Quote:

Originally Posted by BRGriff

I learned something new from this discussion which will help when I myself post a recipe. Thank you.

It's not immediately obvious that Code tags are needed or that Spoiler tags are helpful, but when you've read a few hundred threads, or tried to answer questions about posted recipes that have lost all the Python-required indents, you will learn to appreciate the advantages of those tags.

Thanks to all who use them!

canislupus · 06-26-2011, 03:32 PM

Thank you sexymax15. Works great.

kindle88 · 07-23-2011, 12:40 PM

I tried sexymax15's recipe. The table of contents shows all the articles but the links don't work.

I also updated Calibre to 0.8.22 (which has an improved Maclean's news recipe). Still, the links in the table of contents don't work, and I also noticed that the articles were not actually downloaded.

Does anyone else have this issue?

Starson17 · 07-24-2011, 09:38 AM

Quote:

Originally Posted by kindle88

I tried sexymax15's recipe. The table of contents shows all the articles but the links don't work.

I also updated Calibre to 0.8.22 (which has an improved Maclean's news recipe). Still, the links in the table of contents don't work, and I also noticed that the articles were not actually downloaded.

Does anyone else have this issue?

You can try this:
https://www.mobileread.com/forums/sho...439#fix_myself

05-24-2011, 10:30 AM	#1
canislupus Junior Member Posts: 2 Karma: 10 Join Date: May 2011 Device: Kindle 3	Recipe request - Macleans Magazine Hi All, Newbie here and I have a question. Maclean Magazine is not downloading for me in Caliber and when I did a google search I found a recipe in the Caliber forums archives that someone had posted for Macleans Magazine (see below) but when I tried to use it to "Fetch News" in Caliber it doesn't work. Please help, how do I get the magazine downloaded into an e-book? It is a weekly magazine. Thank you in advance! #!/usr/bin/env python __license__ = 'GPL v3' ''' macleans.ca ''' import string, re from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString from datetime import timedelta, datetime, date class Macleans(BasicNewsRecipe): title = u'Macleans Magazine' __author__ = 'Nick Redding' language = 'en_CA' description = ('Macleans Magazine') no_stylesheets = True timefmt = ' [%b %d]' # customization notes: delete sections you are not interested in # set oldest_article to the maximum number of days back from today to include articles sectionlist = [ ['http://www2.macleans.ca/','Front Page'], ['http://www2.macleans.ca/category/canada/','Canada'], ['http://www2.macleans.ca/category/world-from-the-magazine/','World'], ['http://www2.macleans.ca/category/business','Business'], ['http://www2.macleans.ca/category/arts-culture/','Culture'], ['http://www2.macleans.ca/category/opinion','Opinion'], ['http://www2.macleans.ca/category/health-from-the-magazine/','Health'], ['http://www2.macleans.ca/category/environment-from-the-magazine/','Environment'], ['http://www2.macleans.ca/category/education/','On Campus'], ['http://www2.macleans.ca/category/travel-from-the-magazine/','Travel'] ] oldest_article = 7 # formatting for print version of articles extra_css = '''h2{font-family:Times,serif; font-size:large;} small {font-family:Times,serif; font-size:xx-small; 
list-style-type: none;} ''' # tag handling for print version of articles keep_only_tags = [dict(id='tw-print')] remove_tags = [dict({'class':'postmetadata'})] def get_browser(self): br = BasicNewsRecipe.get_browser() return br def preprocess_html(self,soup): for img_tag in soup.findAll('img'): parent_tag = img_tag.parent if parent_tag.name == 'a': new_tag = Tag(soup,'p') new_tag.insert(0,img_tag) parent_tag.replaceWith(new_tag) elif parent_tag.name == 'p': if not self.tag_to_string(parent_tag) == '': new_div = Tag(soup,'div') new_tag = Tag(soup,'p') new_tag.insert(0,img_tag) parent_tag.replaceWith(new_div) new_div.insert(0,new_tag) new_div.insert(1,parent_tag) return soup def parse_index(self): articles = {} key = None ans = [] def parse_index_page(page_url,page_title): def decode_date(datestr): dmysplit = datestr.strip().lower().split(',') mdsplit = dmysplit[1].split() m = ['january','february','march','april','may','june', 'july','august','september','october','november',' december'].index(mdsplit[0])+1 d = int(mdsplit[1]) y = int(dmysplit[2].split()[0]) return date(y,m,d) def article_title(tag): atag = tag.find('a',href=True) if not atag: return '' return self.tag_to_string(atag) def article_url(tag): atag = tag.find('a',href=True) if not atag: return '' return atag['href']+'print/' def article_description(tag): for p_tag in tag.findAll('p'): d = self.tag_to_string(p_tag,False) if not d == '': return d return '' def compound_h4_h3_title(tag): if tag.h4: if tag.h3: return self.tag_to_string(tag.h4,False)+u'\u2014'+self.ta g_to_string(tag.h3,False) else: return self.tag_to_string(tag.h4,False) elif tag.h3: return self.tag_to_string(tag.h3,False) else: return '' def compound_h2_h4_title(tag): if tag.h2: if tag.h4: return self.tag_to_string(tag.h2,False)+u'\u2014'+self.ta g_to_string(tag.h4,False) else: return self.tag_to_string(tag.h2,False) elif tag.h4: return self.tag_to_string(tag.h4,False) else: return '' def handle_article(header_tag, outer_tag): if 
header_tag: url = article_url(header_tag) title = article_title(header_tag) author_date_tag = outer_tag.h4 if author_date_tag: author_date = self.tag_to_string(author_date_tag,False).split(' - ') author = author_date[0].strip() article_date = decode_date(author_date[1]) earliest_date = date.today() - timedelta(days=self.oldest_article) if article_date < earliest_date: self.log("Skipping article dated %s" % author_date[1]) else: excerpt_div = outer_tag.find('div','excerpt') if excerpt_div: description = article_description(excerpt_div) else: description = '' if not articles.has_key(page_title): articles[page_title] = [] articles[page_title].append(dict(title=title,url=url,date=author_date[1],description=description,author=author,content='') ) def handle_category_article(cat, header_tag, outer_tag): url = article_url(header_tag) title = article_title(header_tag) if not title == '': title = cat+u'\u2014'+title a_tag = outer_tag.find('span','authorLink') if a_tag: author = self.tag_to_string(a_tag,False) a_tag.parent.extract() else: author = '' description = article_description(outer_tag) if not articles.has_key(page_title): articles[page_title] = [] articles[page_title].append(dict(title=title,url=url,date='',descripti on=description,author=author,content='')) soup = self.index_to_soup(page_url) if page_title == 'Front Page': # special processing for the front page top_stories = soup.find('div',{ "id" : "macleansFeatured" }) if top_stories: for div_slide in top_stories.findAll('div','slide'): url = article_url(div_slide) div_title = div_slide.find('div','header') if div_title: title = self.tag_to_string(div_title,False) else: title = '' description = article_description(div_slide) if not articles.has_key(page_title): articles[page_title] = [] articles[page_title].append(dict(title=title,url=url,date='',descripti on=description,author='',content='')) from_macleans = soup.find('div',{ "id" : "fromMacleans" }) if from_macleans: for li_tag in 
from_macleans.findAll('li','fromMacleansArticle'): title = compound_h4_h3_title(li_tag) url = article_url(li_tag) description = article_description(li_tag) if not articles.has_key(page_title): articles[page_title] = [] articles[page_title].append(dict(title=title,url=url,date='',descripti on=description,author='',content='')) blog_central = soup.find('div',{ "id" : "bloglist" }) if blog_central: for li_tag in blog_central.findAll('li'): title = compound_h2_h4_title(li_tag) if li_tag.h4: url = article_url(li_tag.h4) if not articles.has_key(page_title): articles[page_title] = [] articles[page_title].append(dict(title=title,url=url,date='',descripti on='',author='',content='')) # need_to_know = soup.find('div',{ "id" : "needToKnow" }) # if need_to_know: # for div_tag in need_to_know('div',attrs={'class' : re.compile("^needToKnowArticle")}): # title = compound_h4_h3_title(div_tag) # url = article_url(div_tag) # description = article_description(div_tag) # if not articles.has_key(page_title): # articles[page_title] = [] # articles[page_title].append(dict(title=title,url=url,date='',descripti on=description,author='',content='')) for news_category in soup.findAll('div','newsCategory'): news_cat = self.tag_to_string(news_category.h4,False) handle_category_article(news_cat, news_category.find('h2'), news_category.find('div')) for news_item in news_category.findAll('li'): handle_category_article(news_cat,news_item.h3,news _item) return # find the div containing the highlight article div_post = soup.find('div','post') if div_post: h1_tag = div_post.h1 handle_article(h1_tag,div_post) # find the divs containing the rest of the articles div_other = div_post.find('div', { "id" : "categoryOtherPosts" }) if div_other: for div_entry in div_other.findAll('div','entry'): h2_tag = div_entry.h2 handle_article(h2_tag,div_entry) for page_name,page_title in self.sectionlist: parse_index_page(page_name,page_title) ans.append(page_title) ans = [(key, articles[key]) for key in ans if 
articles.has_key(key)] return ans

Similar Threads
Thread	Thread Starter	Forum	Replies	Last Post
Reason Magazine request	c0llin	Recipes	4	03-28-2022 02:04 PM
Request: Wired Magazine UK	StalkS	Recipes	4	06-10-2011 04:08 PM
Recipe Request for World Magazine	fbrian	Recipes	3	06-05-2011 11:10 AM
Help request with italian magazine	lorenzo2004	Recipes	1	05-09-2011 05:43 AM
Request: recipe for google new "Think Quarterly" magazine	gloomygod	Recipes	0	03-24-2011 09:08 AM

06-26-2011, 03:32 PM	#6
canislupus Junior Member Posts: 2 Karma: 10 Join Date: May 2011 Device: Kindle 3	Thank you sexymax15. Works great.

07-23-2011, 12:40 PM	#7
kindle88 Member Posts: 13 Karma: 10 Join Date: Jul 2011 Device: kindle	I tried sexymax15's recipe. The table of contents shows all the articles but the links don't work. I also updated Calibre to 0.8.22 (which has an improved Maclean news recipe). Still, the links in the table of contents don't work and also noticed that the articles were not actually downloaded. Anyone else has this issue?

Advert

Advert