Updated recipes - El Correo & El periódico de Aragón [ES]

desUBIKado · 08-09-2013, 05:33 AM

Hi there:

El Correo

- fixed formatting problems
- added cover retrieval

Spoiler:

Code:

#!/usr/bin/env  python
# Module-level metadata for the El Correo recipe.
__license__     = 'GPL v3'
# Month name spelling corrected ("Januery" -> "January").
__copyright__   = '08 January 2011, desUBIKado'
__author__      = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__     = 'v0.10'
__date__        = '07, August 2013'
# Site covered by this recipe (bare string kept as in the original; it has
# no runtime effect because it is not the first statement of the module).
'''
http://www.elcorreo.com/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe

class heraldo(BasicNewsRecipe):
    """calibre news recipe for the daily newspaper El Correo (Biscay edition).

    Articles are collected from the RSS feeds listed in ``feeds``, reduced to
    the containers named in ``keep_only_tags`` and cleaned up with
    ``remove_tags`` and ``preprocess_regexps``.  The cover is the PDF of the
    current day's front page when it can be opened, otherwise the masthead
    logo.
    """

    # ``__author__`` is the attribute the sibling recipe in this file sets;
    # ``author`` is kept so nothing that read it before breaks.
    __author__            = 'desUBIKado'
    author                = 'desUBIKado'
    description           = 'Daily newspaper from Biscay'
    title                 = u'El Correo'
    publisher             = 'Vocento'
    category              = 'News, politics, culture, economy, general interest'
    oldest_article        = 2
    delay                 = 1
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    masthead_url          = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
    language              = 'es'
    timefmt               = '[%a, %d %b, %Y]'
    encoding              = 'iso-8859-1'
    remove_empty_feeds    = True
    # Left as False: the first group of preprocess_regexps below rewrites
    # inline script text into <img> tags.
    remove_javascript     = False

    # (section title, RSS url) pairs.
    feeds = [
        (u'Portada',       u'http://www.elcorreo.com/vizcaya/portada.xml'),
        (u'Local',         u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
        # Fixed: the scheme was misspelled 'hhttp://', so this feed could
        # never be downloaded.
        (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
        (u'Econom\xeda',   u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
        (u'Pol\xedtica',   u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
        (u'Opini\xf3n',    u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
        (u'Deportes',      u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
        (u'Sociedad',      u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
        (u'Cultura',       u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
        (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
        (u'Gente',         u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class':['grouphead','date','art_head','story-texto','text','colC_articulo','contenido_comentarios']}),
        dict(name='div', attrs={'id':['articulo','story-texto','story-entradilla']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['art_barra','detalles-opinion','formdenunciar','modulo calculadoras','nubetags','pie']}),
        dict(name='div', attrs={'class':['mod_lomas','bloque_lomas','blm_header','link-app3','link-app4','botones_listado']}),
        dict(name='div', attrs={'class':['navegacion_galeria','modulocanalpromocion','separa','separacion','compartir','tags_relacionados']}),
        dict(name='div', attrs={'class':['moduloBuscadorDeportes','modulo-gente','moddestacadopeq','OpcArt','articulopiniones']}),
        dict(name='div', attrs={'class':['modulo-especial','publiEspecial','carruselNoticias','vj','modulocomun2']}),
        dict(name='div', attrs={'id':['articulopina','webs_asociadas']}),
        dict(name='br', attrs={'class':'clear'}),
        dict(name='form', attrs={'name':'frm_conversor2'}),
    ]

    remove_tags_before = dict(name='div', attrs={'class':'articulo  '})
    remove_tags_after  = dict(name='div', attrs={'class':'robapaginas'})

    def get_cover_url(self):
        """Return the URL to use as the cover.

        Builds the URL of today's front-page PDF (local date, ``ddmmyyyy``)
        and tries to open it.  If opening fails for any reason, a message is
        logged and the masthead logo URL is returned instead.
        """
        # Example: http://info.elcorreo.com/pdf/07082013-viz.pdf
        stamp = time.strftime('%d%m%Y', time.localtime())
        cover = 'http://info.elcorreo.com/pdf/' + stamp + '-viz.pdf'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            # Best effort: any failure to fetch the PDF falls back to the
            # logo.  ``Exception`` (not a bare except) so Ctrl-C and
            # SystemExit still propagate.
            self.log("\nPortada no disponible")
            cover = self.masthead_url
        return cover

    extra_css = '''
                    h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
                    h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
                    h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
                    h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
                    h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
                    h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
                    .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
                    img{margin-bottom: 0.4em}
                '''

    # (compiled pattern, replacement callable) pairs, listed in the order
    # they were written; several of them work together, so keep the order.
    preprocess_regexps = [

        # Show the still image of embedded videos.
        (re.compile(r'var RUTA_IMAGEN', re.DOTALL|re.IGNORECASE), lambda match: '</script><img src'),
        # Fixed: the dot is now escaped so only a literal ".jpg" matches.
        (re.compile(r'\.jpg";', re.DOTALL|re.IGNORECASE), lambda match: '.jpg">'),
        (re.compile(r'var SITIO = "elcorreo";', re.DOTALL|re.IGNORECASE), lambda match: '<SCRIPT TYPE="text/JavaScript"'),

        # Separate paragraphs with a blank line.
        (re.compile(r'<div class="p"', re.DOTALL|re.IGNORECASE), lambda match: '<p></p><div class="p"'),

        # Put a blank line between the subtitle and the article date/time.
        (re.compile(r'<div class="date">', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="date">'),

        # Put a blank line between the lead-in of embedded videos and the
        # preceding text.
        (re.compile(r'<div class="video"', re.DOTALL|re.IGNORECASE), lambda match: '<br><div class="video"'),

        # Expose the photos after the first one when they are presented as
        # a gallery: drop the placeholder src and turn rel= into src=.
        (re.compile(r'src="/img/shim.gif"', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'rel=', re.DOTALL|re.IGNORECASE), lambda match: 'src='),

        # Remove the link from the headline.
        (re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL|re.IGNORECASE), lambda match: '<h1 class="'),
        (re.compile(r'</a>\n</h1>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'),

    ]

El periódico de Aragón

- fixed cover retrieval (get_cover_url)

Spoiler:

Code:

#!/usr/bin/env  python
# -*- coding: utf-8 -*-

# Module-level metadata for the El Periodico de Aragon recipe.
__license__ = "GPL v3"
__copyright__ = "04 December 2010, desUBIKado"
__author__ = "desUBIKado"
__description__ = "Daily newspaper from Aragon"
__version__ = "v0.09"
__date__ = "07, August 2013"

# Site covered by this recipe (bare string, no runtime effect).
"""
elperiodicodearagon.com
"""
import re
from calibre.web.feeds.news import BasicNewsRecipe


class elperiodicodearagon(BasicNewsRecipe):
    """calibre news recipe for El Periodico de Aragon.

    Articles come from the RSS feeds in ``feeds``; each article URL is
    rewritten by ``print_version`` to the site's ``/m/`` path, and only the
    ``div`` with id ``Noticia`` is kept.
    """

    title = u'El Periodico de Aragon'
    __author__ = u'desUBIKado'
    description = u'Noticias desde Aragon'
    publisher = u'elperiodicodearagon.com'
    category = u'news, politics, Spain, Aragon'
    language = 'es'
    encoding = 'iso-8859-1'
    masthead_url = 'http://pdf.elperiodicodearagon.com/img/logotipo.gif'

    oldest_article = 1
    max_articles_per_feed = 100
    delay = 0

    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True

    # Built from the attributes defined above, so it must stay below them.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # (section title, RSS url) pairs.
    feeds = [
        (u'Portada', u'http://zetaestaticos.com/aragon/rss/portada_es.xml'),
        (u'Arag\xf3n', u'http://zetaestaticos.com/aragon/rss/2_es.xml'),
        (u'Internacional', u'http://zetaestaticos.com/aragon/rss/4_es.xml'),
        (u'Espa\xf1a', u'http://zetaestaticos.com/aragon/rss/3_es.xml'),
        (u'Econom\xeda', u'http://zetaestaticos.com/aragon/rss/5_es.xml'),
        (u'Deportes', u'http://zetaestaticos.com/aragon/rss/7_es.xml'),
        (u'Real Zaragoza', u'http://zetaestaticos.com/aragon/rss/10_es.xml'),
        (u'CAI Zaragoza', u'http://zetaestaticos.com/aragon/rss/91_es.xml'),
        (u'Monta\xf1ismo', u'http://zetaestaticos.com/aragon/rss/354_es.xml'),
        (u'Opini\xf3n', u'http://zetaestaticos.com/aragon/rss/103_es.xml'),
        (u'Tema del d\xeda', u'http://zetaestaticos.com/aragon/rss/102_es.xml'),
        (u'Escenarios', u'http://zetaestaticos.com/aragon/rss/105_es.xml'),
        (u'Sociedad', u'http://zetaestaticos.com/aragon/rss/104_es.xml'),
        (u'Gente', u'http://zetaestaticos.com/aragon/rss/330_es.xml'),
        (u'Espacio 3', u'http://zetaestaticos.com/aragon/rss/328_es.xml'),
        (u'Fiestas del Pilar', u'http://zetaestaticos.com/aragon/rss/107_es.xml'),
    ]

    remove_attributes = ['height', 'width']

    keep_only_tags = [dict(name='div', attrs={'id': 'Noticia'})]

    # Fetch the print-edition front page (the format=1 image has higher
    # resolution).
    def get_cover_url(self):
        """Return the absolute URL of the front-page image, or None.

        Loads the edition index page and returns the first ``img`` whose
        ``src`` starts with the image-script path, prefixed with the PDF
        site's host.  Returns None when no such image is present.
        """
        host = 'http://pdf.elperiodicodearagon.com'
        wanted_prefix = '/funciones/img-public.php?key='
        page = self.index_to_soup(host + '/edicion.php')
        candidates = (
            tag['src']
            for tag in page.findAll('img', src=True)
            if tag['src'].startswith(wanted_prefix)
        )
        first = next(candidates, None)
        if first is None:
            return None
        return host + first

    # Use the mobile version of the site.
    def print_version(self, url):
        """Return ``url`` with the site root replaced by its ``/m/`` path."""
        desktop_root = 'http://www.elperiodicodearagon.com/'
        mobile_root = 'http://www.elperiodicodearagon.com/m/'
        return url.replace(desktop_root, mobile_root)

Similar Threads
Thread	Thread Starter	Forum	Replies	Last Post
Asian Recipes - 50 Tasty & Easy Unique Exotic Recipes (With Images Of Each Dish And C	asiafoodguru	Self-Promotions by Authors and Publishers	1	08-10-2012 05:01 AM
Updated recipe: Heraldo de Aragon [ES]	desUBIKado	Recipes	0	06-30-2012 09:58 AM
Updated recipe: El periódico de Aragón (Spanish)	desUBIKado	Recipes	0	02-10-2011 02:17 PM