# youtube-dl (2011-era, Python 2): module header fragment.
2 # -*- coding: utf-8 -*-
# Author list fragment; the __authors__ sequence opening is outside this excerpt.
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
17 __license__ = 'Public Domain'
18 __version__ = '2011.09.15'
# Location of the latest release, used by the self-update mechanism.
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
# Import-compatibility shims for old interpreters follow (fragmentary).
49 except ImportError: # Python 2.4
52 import cStringIO as StringIO
56 # parse_qs was moved from the cgi module to the urlparse module recently.
58 from urlparse import parse_qs
60 from cgi import parse_qs
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
# Fragment of the default HTTP header dict (std_headers) added to every
# request by YoutubeDLHandler; the enclosing assignment is outside this excerpt.
73 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
74 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
75 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
76 'Accept-Encoding': 'gzip, deflate',
77 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in "simplified" titles: ASCII letters and digits.
# .decode('ascii') produces unicode objects under Python 2.
80 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# Fallback JSON decoder: when the stdlib json module is unavailable
# (Python < 2.6), a minimal pure-Python parser ("trivialjson") is defined
# inline. Only fragments of it are visible in this excerpt; `s` is the
# input string closed over by the nested helpers.
84 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
# Helper: raise ValueError with position context for parse failures.
90 def raiseError(msg, i):
91 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
# Helper: advance index i past JSON whitespace.
92 def skipSpace(i, expectMore=True):
93 while i < len(s) and s[i] in ' \t\r\n':
97 raiseError('Premature end', i)
# Helper: decode one backslash escape matched inside a JSON string;
# handles \uXXXX and UTF-16 surrogate pairs (\uD8xx\uDCxx -> one code point).
99 def decodeEscape(match):
115 return unichr(int(esc[1:5], 16))
116 if len(esc) == 5+6 and esc[5:7] == '\\u':
117 hi = int(esc[1:5], 16)
118 low = int(esc[7:11], 16)
119 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
120 raise ValueError('Unknown escape ' + str(esc))
# String parsing fragment: locate the closing quote (an even number of
# preceding backslashes means the quote is unescaped), then substitute escapes.
127 while s[e-bslashes-1] == '\\':
129 if bslashes % 2 == 1:
133 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
134 stri = rexp.sub(decodeEscape, s[i:e])
# Object parsing fragment: key / colon / value / comma state machine.
140 if s[i] == '}': # Empty dictionary
144 raiseError('Expected a string object key', i)
145 i,key = parseString(i)
147 if i >= len(s) or s[i] != ':':
148 raiseError('Expected a colon', i)
155 raiseError('Expected comma or closing curly brace', i)
# Array parsing fragment.
160 if s[i] == ']': # Empty array
165 i = skipSpace(i) # Raise exception if premature end
169 raiseError('Expected a comma or closing bracket', i)
# Literals: true / false / null.
171 def parseDiscrete(i):
172 for k,v in {'true': True, 'false': False, 'null': None}.items():
173 if s.startswith(k, i):
175 raiseError('Not a boolean (or null)', i)
# Numbers: int when no fraction/exponent marker is present, float otherwise.
177 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
179 raiseError('Not a number', i)
181 if '.' in nums or 'e' in nums or 'E' in nums:
182 return (i+len(nums), float(nums))
183 return (i+len(nums), int(nums))
# Dispatch on the first character of a value; anything else is tried as a number.
184 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
187 i,res = CHARMAP.get(s[i], parseNumber)(i)
188 i = skipSpace(i, False)
192 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
195 def preferredencoding():
196 """Get preferred encoding.
198 Returns the best encoding scheme for the system, based on
199 locale.getpreferredencoding() and some further tweaks.
# Wrapping the locale query in a generator defers the (possibly failing)
# call; .next() pulls the first yielded encoding. The fallback branches
# (presumably yielding a default on failure) are outside this excerpt.
201 def yield_preferredencoding():
203 pref = locale.getpreferredencoding()
209 return yield_preferredencoding().next()
212 def htmlentity_transform(matchobj):
213 """Transforms an HTML entity to a Unicode character.
215 This function receives a match object and is intended to be used with
216 the re.sub() function.
218 entity = matchobj.group(1)
220 # Known non-numeric HTML entity
221 if entity in htmlentitydefs.name2codepoint:
222 return unichr(htmlentitydefs.name2codepoint[entity])
225 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 numstr = mobj.group(1)
228 if numstr.startswith(u'x'):
230 numstr = u'0%s' % numstr
233 return unichr(long(numstr, base))
235 # Unknown entity in name, return its literal representation
236 return (u'&%s;' % entity)
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
245 def sanitize_open(filename, open_mode):
246 """Try to open the given filename, and slightly tweak it if this fails.
248 Attempts to open the given filename. If this fails, it tries to change
249 the filename slightly, step by step, until it's either able to open it
250 or it fails and raises a final exception, like the standard open()
253 It returns the tuple (stream, definitive_file_name).
# '-' means stdout (branch condition outside this excerpt); on Windows,
# stdout must be switched to binary mode so video bytes are not mangled
# by newline translation.
257 if sys.platform == 'win32':
259 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
260 return (sys.stdout, filename)
261 stream = open(filename, open_mode)
262 return (stream, filename)
263 except (IOError, OSError), err:
264 # In case of error, try to remove win32 forbidden chars
265 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267 # An exception here should be caught in the caller
268 stream = open(filename, open_mode)
269 return (stream, filename)
272 def timeconvert(timestr):
273 """Convert RFC 2822 defined time string into system timestamp"""
# NOTE(review): presumably returns None when the string cannot be parsed;
# the initializer/return lines are outside this excerpt — confirm.
275 timetuple = email.utils.parsedate_tz(timestr)
276 if timetuple is not None:
277 timestamp = email.utils.mktime_tz(timetuple)
# Exception hierarchy used throughout the downloader. Class bodies are
# partly outside this excerpt.
281 class DownloadError(Exception):
282 """Download Error exception.
284 This exception may be thrown by FileDownloader objects if they are not
285 configured to continue on errors. They will contain the appropriate
291 class SameFileError(Exception):
292 """Same File exception.
294 This exception will be thrown by FileDownloader objects if they detect
295 multiple files would have to be downloaded to the same file on disk.
300 class PostProcessingError(Exception):
301 """Post Processing exception.
303 This exception may be raised by PostProcessor's .run() method to
304 indicate an error in the postprocessing task.
309 class UnavailableVideoError(Exception):
310 """Unavailable Format exception.
312 This exception will be thrown when a video is requested
313 in a format that is not available for that video.
318 class ContentTooShortError(Exception):
319 """Content Too Short exception.
321 This exception may be raised by FileDownloader objects when a file they
322 download is too small for what the server announced first, indicating
323 the connection was probably interrupted.
# downloaded/expected are byte counts used by callers to build the report.
329 def __init__(self, downloaded, expected):
330 self.downloaded = downloaded
331 self.expected = expected
334 class YoutubeDLHandler(urllib2.HTTPHandler):
335 """Handler for HTTP requests and responses.
337 This class, when installed with an OpenerDirector, automatically adds
338 the standard headers to every HTTP request and handles gzipped and
339 deflated responses from web servers. If compression is to be avoided in
340 a particular request, the original request in the program code only has
341 to include the HTTP header "Youtubedl-No-Compression", which will be
342 removed before making the real request.
344 Part of this code was copied from:
346 http://techknack.net/python-urllib2-handlers/
348 Andrew Rowls, the author of that code, agreed to release it to the
# Raw-deflate bodies lack the zlib header, hence the -MAX_WBITS variant;
# the try/except structure around these two returns is outside this excerpt.
355 return zlib.decompress(data, -zlib.MAX_WBITS)
357 return zlib.decompress(data)
# addinfourl only grew a getcode() in later Python versions; emulate it
# on older interpreters.
360 def addinfourl_wrapper(stream, headers, url, code):
361 if hasattr(urllib2.addinfourl, 'getcode'):
362 return urllib2.addinfourl(stream, headers, url, code)
363 ret = urllib2.addinfourl(stream, headers, url)
# Outgoing side: add std_headers and honour the no-compression marker.
367 def http_request(self, req):
368 for h in std_headers:
371 req.add_header(h, std_headers[h])
372 if 'Youtubedl-no-compression' in req.headers:
373 if 'Accept-encoding' in req.headers:
374 del req.headers['Accept-encoding']
375 del req.headers['Youtubedl-no-compression']
# Incoming side: transparently decompress gzip and deflate bodies,
# re-wrapping the stream so callers see a normal response object.
378 def http_response(self, req, resp):
381 if resp.headers.get('Content-encoding', '') == 'gzip':
382 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
383 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
384 resp.msg = old_resp.msg
386 if resp.headers.get('Content-encoding', '') == 'deflate':
387 gz = StringIO.StringIO(self.deflate(resp.read()))
388 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
389 resp.msg = old_resp.msg
393 class FileDownloader(object):
394 """File Downloader class.
396 File downloader objects are the ones responsible of downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader handles it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video description to a .info.json file
# Class-level placeholders; real values are assigned per-instance below.
454 _download_retcode = None
455 _num_downloads = None
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
462 self._download_retcode = 0
463 self._num_downloads = 0
# logtostderr selects stderr as the "screen" stream so stdout stays clean
# for machine-readable output (the bool indexes the two-element list).
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# Static formatting/calculation helpers. Their decorators (presumably
# @staticmethod) are outside this excerpt.
468 def format_bytes(bytes):
471 if type(bytes) is str:
# 1024-based unit ladder: b, k, M, G, ...
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
# Percentage of data_len already downloaded, right-aligned to 6 chars.
482 def calc_percent(byte_counter, data_len):
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# ETA as MM:SS computed from the observed average rate.
488 def calc_eta(start, now, total, current):
492 if current == 0 or dif < 0.001: # One millisecond
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
499 return '%02d:%02d' % (eta_mins, eta_secs)
# Average download speed formatted as '<bytes>/s' in a 10-char field.
502 def calc_speed(start, now, bytes):
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Adaptive read size: move towards the measured rate, clamped to [min, 4MB].
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
514 rate = bytes / elapsed_time
# Inverse of format_bytes: e.g. '10.5M' -> long byte count.
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
# Mutual registration: the IE receives a back-reference to this downloader.
534 ie.set_downloader(self)
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
539 pp.set_downloader(self)
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
544 if not self.params.get('quiet', False):
# skip_eol selects u'' as terminator so progress lines can be rewritten.
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
552 def to_stderr(self, message):
553 """Print message to stderr."""
554 print >>sys.stderr, message.encode(preferredencoding())
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
# Xterm-compatible escape sequence (OSC 0) to set the window title.
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
578 if message is not None:
579 self.to_stderr(message)
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
# When errors are ignored, remember the failure for the process exit code.
582 self._download_retcode = 1
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
590 elapsed = now - start_time
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
# Sleep exactly long enough for the average rate to drop to the limit.
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
# No .part file for stdout ('-'), when parts are disabled, or for
# existing non-regular files (e.g. named pipes).
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
602 return filename + u'.part'
604 def undo_temp_name(self, filename):
# Strip the .part suffix added by temp_name, if present.
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
609 def try_rename(self, old_filename, new_filename):
# Best-effort rename of the temporary file to its final name.
611 if old_filename == new_filename:
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
621 if not os.path.isfile(filename):
623 timestr = last_modified_hdr
626 filetime = timeconvert(timestr)
# atime := now, mtime := server-provided Last-Modified timestamp.
630 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
	"""Notify the user that the video description is being saved."""
	notice = u'[info] Writing video description to: %s' % descfn
	self.to_screen(notice, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
	"""Notify the user that the JSON metadata file has been written."""
	notice = u'[info] Video description metadata as JSON to: %s' % infofn
	self.to_screen(notice, ignore_encoding_errors=True)
def report_destination(self, filename):
	"""Announce the file the download will be written to."""
	notice = u'[download] Destination: %s' % filename
	self.to_screen(notice, ignore_encoding_errors=True)
646 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
647 """Report download progress."""
648 if self.params.get('noprogress', False):
# Leading \r rewrites the current terminal line instead of scrolling.
650 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
651 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
# Mirror the progress into the console/window title when enabled.
652 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
653 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
	"""Announce that the download resumes at the given byte offset."""
	notice = u'[download] Resuming download at byte %s' % resume_len
	self.to_screen(notice)
def report_retry(self, count, retries):
	"""Announce a retry after a server-side (5xx) HTTP error."""
	notice = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
	self.to_screen(notice)
663 def report_file_already_downloaded(self, file_name):
664 """Report file has already been fully downloaded."""
666 self.to_screen(u'[download] %s has already been downloaded' % file_name)
667 except (UnicodeEncodeError), err:
# Fall back to a name-free message when the filename cannot be encoded.
668 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
	"""Announce that resuming failed and the download restarts from scratch."""
	notice = u'[download] Unable to resume'
	self.to_screen(notice)
674 def report_finish(self):
675 """Report download finished."""
# With --no-progress this is the only completion notice printed; the
# else-branch (presumably printing a newline after the progress bar) is
# outside this excerpt.
676 if self.params.get('noprogress', False):
677 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
	"""Advance the ordinal used for %(autonumber)s output numbering."""
	self._num_downloads = self._num_downloads + 1
685 def prepare_filename(self, info_dict):
686 """Generate the output filename."""
688 template_dict = dict(info_dict)
# Extra template fields: epoch (unix time) and a zero-padded ordinal.
689 template_dict['epoch'] = unicode(long(time.time()))
690 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
691 filename = self.params['outtmpl'] % template_dict
693 except (ValueError, KeyError), err:
694 self.trouble(u'ERROR: invalid system charset or erroneous output template')
697 def process_info(self, info_dict):
698 """Process a single dictionary returned by an InfoExtractor."""
699 filename = self.prepare_filename(info_dict)
700 # Do nothing else if in simulate mode
701 if self.params.get('simulate', False):
# Forced printing goes straight to stdout so output can be scripted.
703 if self.params.get('forcetitle', False):
704 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
705 if self.params.get('forceurl', False):
706 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
707 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
708 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
709 if self.params.get('forcedescription', False) and 'description' in info_dict:
710 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
711 if self.params.get('forcefilename', False) and filename is not None:
712 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
# Title filters: --match-title / --reject-title regexes.
719 matchtitle=self.params.get('matchtitle',False)
720 rejecttitle=self.params.get('rejecttitle',False)
721 title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
722 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
723 self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
725 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
726 self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
729 if self.params.get('nooverwrites', False) and os.path.exists(filename):
730 self.to_stderr(u'WARNING: file exists and will be skipped')
# Ensure the destination directory exists before opening any file.
734 dn = os.path.dirname(filename)
735 if dn != '' and not os.path.exists(dn):
737 except (OSError, IOError), err:
738 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
# Optional side files: .description and .info.json.
741 if self.params.get('writedescription', False):
743 descfn = filename + '.description'
744 self.report_writedescription(descfn)
745 descfile = open(descfn, 'wb')
747 descfile.write(info_dict['description'].encode('utf-8'))
750 except (OSError, IOError):
751 self.trouble(u'ERROR: Cannot write description file ' + descfn)
754 if self.params.get('writeinfojson', False):
755 infofn = filename + '.info.json'
756 self.report_writeinfojson(infofn)
# 'json' may be undefined when neither json nor simplejson imported above.
759 except (NameError,AttributeError):
760 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
763 infof = open(infofn, 'wb')
765 json.dump(info_dict, infof)
768 except (OSError, IOError):
769 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
# Actual download, mapping low-level failures to user-facing messages.
773 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
774 except (OSError, IOError), err:
775 raise UnavailableVideoError
776 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
777 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
779 except (ContentTooShortError, ), err:
780 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
785 self.post_process(filename, info_dict)
786 except (PostProcessingError), err:
787 self.trouble(u'ERROR: postprocessing: %s' % str(err))
790 def download(self, url_list):
791 """Download a given list of URLs."""
# Refuse multiple URLs with a fixed (non-templated) output name: they
# would all be written to the same file.
792 if len(url_list) > 1 and self.fixed_template():
793 raise SameFileError(self.params['outtmpl'])
796 suitable_found = False
798 # Go to next InfoExtractor if not suitable
799 if not ie.suitable(url):
802 # Suitable InfoExtractor found
803 suitable_found = True
805 # Extract information from URL and process it
808 # Suitable InfoExtractor had been found; go to next URL
811 if not suitable_found:
812 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
# 0 on success, 1 if any ignored error occurred (set by trouble()).
814 return self._download_retcode
816 def post_process(self, filename, ie_info):
817 """Run the postprocessing chain on the given file."""
# The loop over registered post-processors is outside this excerpt.
819 info['filepath'] = filename
825 def _download_with_rtmpdump(self, filename, url, player_url):
826 self.report_destination(filename)
827 tmpfilename = self.temp_name(filename)
829 # Check for rtmpdump first
831 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
832 except (OSError, IOError):
833 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
836 # Download using rtmpdump. rtmpdump returns exit code 2 when
837 # the connection was interrupted and resuming appears to be
838 # possible. This is part of rtmpdump's normal usage, AFAIK.
839 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
840 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
841 while retval == 2 or retval == 1:
842 prevsize = os.path.getsize(tmpfilename)
843 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
844 time.sleep(5.0) # This seems to be needed
# Retry with -e (resume); -k 1 is only added after exit code 1.
845 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
846 cursize = os.path.getsize(tmpfilename)
847 if prevsize == cursize and retval == 1:
849 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
850 if prevsize == cursize and retval == 2 and cursize > 1024:
851 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
855 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
856 self.try_rename(tmpfilename, filename)
859 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
862 def _do_download(self, filename, url, player_url):
863 # Check file already present
864 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
865 self.report_file_already_downloaded(filename)
868 # Attempt to download using rtmpdump
869 if url.startswith('rtmp'):
870 return self._download_with_rtmpdump(filename, url, player_url)
872 tmpfilename = self.temp_name(filename)
876 # Do not include the Accept-Encoding header
877 headers = {'Youtubedl-no-compression': 'True'}
878 basic_request = urllib2.Request(url, None, headers)
879 request = urllib2.Request(url, None, headers)
881 # Establish possible resume length
882 if os.path.isfile(tmpfilename):
883 resume_len = os.path.getsize(tmpfilename)
887 # Request parameters in case of being able to resume
888 if self.params.get('continuedl', False) and resume_len != 0:
889 self.report_resuming_byte(resume_len)
890 request.add_header('Range', 'bytes=%d-' % resume_len)
# Retry loop: 5xx errors are retried, 416 triggers a resume sanity check.
894 retries = self.params.get('retries', 0)
895 while count <= retries:
896 # Establish connection
898 data = urllib2.urlopen(request)
900 except (urllib2.HTTPError, ), err:
901 if (err.code < 500 or err.code >= 600) and err.code != 416:
902 # Unexpected HTTP error
904 elif err.code == 416:
905 # Unable to resume (requested range not satisfiable)
907 # Open the connection again without the range header
908 data = urllib2.urlopen(basic_request)
909 content_length = data.info()['Content-Length']
910 except (urllib2.HTTPError, ), err:
911 if err.code < 500 or err.code >= 600:
914 # Examine the reported length
915 if (content_length is not None and
916 (resume_len - 100 < long(content_length) < resume_len + 100)):
917 # The file had already been fully downloaded.
918 # Explanation to the above condition: in issue #175 it was revealed that
919 # YouTube sometimes adds or removes a few bytes from the end of the file,
920 # changing the file size slightly and causing problems for some users. So
921 # I decided to implement a suggested change and consider the file
922 # completely downloaded if the file size differs less than 100 bytes from
923 # the one in the hard drive.
924 self.report_file_already_downloaded(filename)
925 self.try_rename(tmpfilename, filename)
928 # The length does not match, we start the download over
929 self.report_unable_to_resume()
935 self.report_retry(count, retries)
938 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Main read/write loop with adaptive block size and progress reporting.
941 data_len = data.info().get('Content-length', None)
942 if data_len is not None:
943 data_len = long(data_len) + resume_len
944 data_len_str = self.format_bytes(data_len)
945 byte_counter = 0 + resume_len
951 data_block = data.read(block_size)
953 if len(data_block) == 0:
955 byte_counter += len(data_block)
957 # Open file just in time
960 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
961 assert stream is not None
962 filename = self.undo_temp_name(tmpfilename)
963 self.report_destination(filename)
964 except (OSError, IOError), err:
965 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
968 stream.write(data_block)
969 except (IOError, OSError), err:
970 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
972 block_size = self.best_block_size(after - before, len(data_block))
# Progress display and rate limiting.
975 percent_str = self.calc_percent(byte_counter, data_len)
976 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
977 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
978 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
981 self.slow_down(start, byte_counter - resume_len)
984 self.trouble(u'\nERROR: Did not get any data blocks')
# Sanity check: the server must have delivered the announced byte count.
988 if data_len is not None and byte_counter != data_len:
989 raise ContentTooShortError(byte_counter, long(data_len))
990 self.try_rename(tmpfilename, filename)
992 # Update file modification time
993 if self.params.get('updatetime', True):
994 self.try_utime(filename, data.info().get('last-modified', None))
999 class InfoExtractor(object):
1000 """Information Extractor class.
1002 Information extractors are the classes that, given a URL, extract
1003 information from the video (or videos) the URL refers to. This
1004 information includes the real video URL, the video title and simplified
1005 title, author and others. The information is stored in a dictionary
1006 which is then passed to the FileDownloader. The FileDownloader
1007 processes this information possibly downloading the video to the file
1008 system, among other possible outcomes. The dictionaries must include
1009 the following fields:
1011 id: Video identifier.
1012 url: Final video URL.
1013 uploader: Nickname of the video uploader.
1014 title: Literal title.
1015 stitle: Simplified title.
1016 ext: Video filename extension.
1017 format: Video format.
1018 player_url: SWF Player URL (may be None).
1020 The following fields are optional. Their primary purpose is to allow
1021 youtube-dl to serve as the backend for a video search function, such
1022 as the one in youtube2mp3. They are only used when their respective
1023 forced printing functions are called:
1025 thumbnail: Full URL to a video thumbnail image.
1026 description: One-line video description.
1028 Subclasses of this one should re-define the _real_initialize() and
1029 _real_extract() methods and define a _VALID_URL regexp.
1030 Probably, they should also be added to the list of extractors.
1036 def __init__(self, downloader=None):
1037 """Constructor. Receives an optional downloader."""
# The "initialized" flag setup is outside this excerpt.
1039 self.set_downloader(downloader)
def suitable(self, url):
	"""Return True when this extractor's _VALID_URL regexp matches url."""
	return bool(re.match(self._VALID_URL, url))
1045 def initialize(self):
1046 """Initializes an instance (authentication, etc)."""
# Presumably guarded so _real_initialize runs at most once; the guard
# lines are outside this excerpt.
1048 self._real_initialize()
1051 def extract(self, url):
1052 """Extracts URL information and returns it in list of dicts."""
1054 return self._real_extract(url)
1056 def set_downloader(self, downloader):
1057 """Sets the downloader for this IE."""
1058 self._downloader = downloader
# Template methods: subclasses override these two.
1060 def _real_initialize(self):
1061 """Real initialization process. Redefine in subclasses."""
1064 def _real_extract(self, url):
1065 """Real extraction process. Redefine in subclasses."""
1069 class YoutubeIE(InfoExtractor):
1070 """Information extractor for youtube.com."""
# Accepts watch/embed/v/youtu.be style URLs; the second group captures
# the video id.
1072 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1073 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1074 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1075 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1076 _NETRC_MACHINE = 'youtube'
1077 # Listed in order of quality
1078 _available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
# Maps format codes to file extensions (only a fragment is visible here).
1079 _video_extensions = {
1085 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1089 IE_NAME = u'youtube'
def report_lang(self):
	"""Announce the attempt to force the site language to English."""
	notice = u'[youtube] Setting language'
	self._downloader.to_screen(notice)
def report_login(self):
	"""Announce the login attempt."""
	notice = u'[youtube] Logging in'
	self._downloader.to_screen(notice)
def report_age_confirmation(self):
	"""Announce the age-confirmation step."""
	notice = u'[youtube] Confirming age'
	self._downloader.to_screen(notice)
def report_video_webpage_download(self, video_id):
	"""Announce the download of the video's web page."""
	notice = u'[youtube] %s: Downloading video webpage' % video_id
	self._downloader.to_screen(notice)
def report_video_info_webpage_download(self, video_id):
	"""Announce the download of the video-info web page."""
	notice = u'[youtube] %s: Downloading video info webpage' % video_id
	self._downloader.to_screen(notice)
def report_information_extraction(self, video_id):
	"""Announce the start of metadata extraction for a video."""
	notice = u'[youtube] %s: Extracting video information' % video_id
	self._downloader.to_screen(notice)
1115 def report_unavailable_format(self, video_id, format):
1116 """Report extracted video URL."""
1117 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1119 def report_rtmp_download(self):
1120 """Indicate the download will use the RTMP protocol."""
1121 self._downloader.to_screen(u'[youtube] RTMP download detected')
1123 def _real_initialize(self):
1124 if self._downloader is None:
1129 downloader_params = self._downloader.params
1131 # Attempt to use provided username and password or .netrc data
1132 if downloader_params.get('username', None) is not None:
1133 username = downloader_params['username']
1134 password = downloader_params['password']
1135 elif downloader_params.get('usenetrc', False):
1137 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1138 if info is not None:
1142 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1143 except (IOError, netrc.NetrcParseError), err:
1144 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1148 request = urllib2.Request(self._LANG_URL)
1151 urllib2.urlopen(request).read()
1152 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1153 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1156 # No authentication to be performed
1157 if username is None:
1162 'current_form': 'loginForm',
1164 'action_login': 'Log In',
1165 'username': username,
1166 'password': password,
1168 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1171 login_results = urllib2.urlopen(request).read()
1172 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1173 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1175 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1182 'action_confirm': 'Confirm',
1184 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1186 self.report_age_confirmation()
1187 age_results = urllib2.urlopen(request).read()
1188 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1189 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1192 def _real_extract(self, url):
1193 # Extract video id from URL
1194 mobj = re.match(self._VALID_URL, url)
1196 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1198 video_id = mobj.group(2)
1201 self.report_video_webpage_download(video_id)
1202 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1204 video_webpage = urllib2.urlopen(request).read()
1205 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1206 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1209 # Attempt to extract SWF player URL
1210 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1211 if mobj is not None:
1212 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1217 self.report_video_info_webpage_download(video_id)
1218 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1219 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1220 % (video_id, el_type))
1221 request = urllib2.Request(video_info_url)
1223 video_info_webpage = urllib2.urlopen(request).read()
1224 video_info = parse_qs(video_info_webpage)
1225 if 'token' in video_info:
1227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1230 if 'token' not in video_info:
1231 if 'reason' in video_info:
1232 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1234 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1237 # Start extracting information
1238 self.report_information_extraction(video_id)
1241 if 'author' not in video_info:
1242 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1244 video_uploader = urllib.unquote_plus(video_info['author'][0])
1247 if 'title' not in video_info:
1248 self._downloader.trouble(u'ERROR: unable to extract video title')
1250 video_title = urllib.unquote_plus(video_info['title'][0])
1251 video_title = video_title.decode('utf-8')
1252 video_title = sanitize_title(video_title)
1255 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1256 simple_title = simple_title.strip(ur'_')
1259 if 'thumbnail_url' not in video_info:
1260 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1261 video_thumbnail = ''
1262 else: # don't panic if we can't find it
1263 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1267 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1268 if mobj is not None:
1269 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1270 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1271 for expression in format_expressions:
1273 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1281 video_description = u'No description available.'
1282 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1283 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1284 if mobj is not None:
1285 video_description = mobj.group(1).decode('utf-8')
1287 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1288 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1289 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1290 # TODO use another parser
1293 video_token = urllib.unquote_plus(video_info['token'][0])
1295 # Decide which formats to download
1296 req_format = self._downloader.params.get('format', None)
1298 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1299 self.report_rtmp_download()
1300 video_url_list = [(None, video_info['conn'][0])]
1301 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1302 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1303 url_data = [parse_qs(uds) for uds in url_data_strs]
1304 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1305 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1307 format_limit = self._downloader.params.get('format_limit', None)
1308 if format_limit is not None and format_limit in self._available_formats:
1309 format_list = self._available_formats[self._available_formats.index(format_limit):]
1311 format_list = self._available_formats
1312 existing_formats = [x for x in format_list if x in url_map]
1313 if len(existing_formats) == 0:
1314 self._downloader.trouble(u'ERROR: no known formats available for video')
1316 if req_format is None:
1317 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1318 elif req_format == '-1':
1319 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1322 if req_format not in url_map:
1323 self._downloader.trouble(u'ERROR: requested format not available')
1325 video_url_list = [(req_format, url_map[req_format])] # Specific format
1327 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1330 for format_param, video_real_url in video_url_list:
1331 # At this point we have a new video
1332 self._downloader.increment_downloads()
1335 video_extension = self._video_extensions.get(format_param, 'flv')
1338 # Process video information
1339 self._downloader.process_info({
1340 'id': video_id.decode('utf-8'),
1341 'url': video_real_url.decode('utf-8'),
1342 'uploader': video_uploader.decode('utf-8'),
1343 'upload_date': upload_date,
1344 'title': video_title,
1345 'stitle': simple_title,
1346 'ext': video_extension.decode('utf-8'),
1347 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1348 'thumbnail': video_thumbnail.decode('utf-8'),
1349 'description': video_description,
1350 'player_url': player_url,
1352 except UnavailableVideoError, err:
1353 self._downloader.trouble(u'\nERROR: unable to download video')
1356 class MetacafeIE(InfoExtractor):
1357 """Information Extractor for metacafe.com."""
# NOTE(review): numbered listing with elided lines; `try:`/`return` statements
# and some closing braces are missing below.
1359 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1360 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1361 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1363 IE_NAME = u'metacafe'
# Keeps a YoutubeIE instance so Metacafe pages that wrap YouTube videos
# (ids of the form "yt-<id>") can be delegated to it.
1365 def __init__(self, youtube_ie, downloader=None):
1366 InfoExtractor.__init__(self, downloader)
1367 self._youtube_ie = youtube_ie
1369 def report_disclaimer(self):
1370 """Report disclaimer retrieval."""
1371 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1373 def report_age_confirmation(self):
1374 """Report attempt to confirm age."""
1375 self._downloader.to_screen(u'[metacafe] Confirming age')
1377 def report_download_webpage(self, video_id):
1378 """Report webpage download."""
1379 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1381 def report_extraction(self, video_id):
1382 """Report information extraction."""
1383 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Initialization: fetch the disclaimer page (for cookies), then POST the
# family-filter form to disable filtering for the session.
1385 def _real_initialize(self):
1386 # Retrieve disclaimer
1387 request = urllib2.Request(self._DISCLAIMER)
1389 self.report_disclaimer()
1390 disclaimer = urllib2.urlopen(request).read()
1391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1392 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1398 'submit': "Continue - I'm over 18",
1400 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1402 self.report_age_confirmation()
1403 disclaimer = urllib2.urlopen(request).read()
1404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1405 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1408 def _real_extract(self, url):
1409 # Extract id and simplified title from URL
1410 mobj = re.match(self._VALID_URL, url)
1412 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1415 video_id = mobj.group(1)
1417 # Check if video comes from YouTube
1418 mobj2 = re.match(r'^yt-(.*)$', video_id)
1419 if mobj2 is not None:
1420 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1423 # At this point we have a new video
1424 self._downloader.increment_downloads()
# The URL's second path segment doubles as the filesystem-safe title.
1426 simple_title = mobj.group(2).decode('utf-8')
1428 # Retrieve video webpage to extract further information
1429 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1431 self.report_download_webpage(video_id)
1432 webpage = urllib2.urlopen(request).read()
1433 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1434 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1437 # Extract URL, uploader and title from webpage
1438 self.report_extraction(video_id)
# Primary path: a plain mediaURL parameter, optionally signed with gdaKey.
1439 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1440 if mobj is not None:
1441 mediaURL = urllib.unquote(mobj.group(1))
1442 video_extension = mediaURL[-3:]
1444 # Extract gdaKey if available
1445 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1447 video_url = mediaURL
1449 gdaKey = mobj.group(1)
1450 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars value as a query string and pull the
# signed mediaURL out of its JSON-ish mediaData field.
1452 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1454 self._downloader.trouble(u'ERROR: unable to extract media URL')
1456 vardict = parse_qs(mobj.group(1))
1457 if 'mediaData' not in vardict:
1458 self._downloader.trouble(u'ERROR: unable to extract media URL')
1460 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1462 self._downloader.trouble(u'ERROR: unable to extract media URL')
1464 mediaURL = mobj.group(1).replace('\\/', '/')
1465 video_extension = mediaURL[-3:]
1466 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1468 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1470 self._downloader.trouble(u'ERROR: unable to extract title')
1472 video_title = mobj.group(1).decode('utf-8')
1473 video_title = sanitize_title(video_title)
1475 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1477 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1479 video_uploader = mobj.group(1)
1482 # Process video information
1483 self._downloader.process_info({
1484 'id': video_id.decode('utf-8'),
1485 'url': video_url.decode('utf-8'),
1486 'uploader': video_uploader.decode('utf-8'),
1487 'upload_date': u'NA',
1488 'title': video_title,
1489 'stitle': simple_title,
1490 'ext': video_extension.decode('utf-8'),
1494 except UnavailableVideoError:
1495 self._downloader.trouble(u'\nERROR: unable to download video')
1498 class DailymotionIE(InfoExtractor):
1499 """Information Extractor for Dailymotion"""
# NOTE(review): numbered listing with elided lines (`try:`/`return` missing in
# places below).
1501 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1502 IE_NAME = u'dailymotion'
1504 def __init__(self, downloader=None):
1505 InfoExtractor.__init__(self, downloader)
1507 def report_download_webpage(self, video_id):
1508 """Report webpage download."""
1509 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1511 def report_extraction(self, video_id):
1512 """Report information extraction."""
1513 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No session setup needed for Dailymotion (body elided in this listing).
1515 def _real_initialize(self):
1518 def _real_extract(self, url):
1519 # Extract id and simplified title from URL
1520 mobj = re.match(self._VALID_URL, url)
1522 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1525 # At this point we have a new video
1526 self._downloader.increment_downloads()
1527 video_id = mobj.group(1)
# Slug after the underscore doubles as the filesystem-safe title.
1529 simple_title = mobj.group(2).decode('utf-8')
1530 video_extension = 'flv'
1532 # Retrieve video webpage to extract further information
# The family_filter cookie disables the age/content filter for this request.
1533 request = urllib2.Request(url)
1534 request.add_header('Cookie', 'family_filter=off')
1536 self.report_download_webpage(video_id)
1537 webpage = urllib2.urlopen(request).read()
1538 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1539 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1542 # Extract URL, uploader and title from webpage
1543 self.report_extraction(video_id)
# The player's "sequence" flashvar is a URL-encoded blob containing sdURL,
# the standard-definition media URL.
1544 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1546 self._downloader.trouble(u'ERROR: unable to extract media URL')
1548 sequence = urllib.unquote(mobj.group(1))
1549 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1551 self._downloader.trouble(u'ERROR: unable to extract media URL')
1553 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1555 # if needed add http://www.dailymotion.com/ if relative URL
1557 video_url = mediaURL
1559 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1561 self._downloader.trouble(u'ERROR: unable to extract title')
1563 video_title = mobj.group(1).decode('utf-8')
1564 video_title = sanitize_title(video_title)
1566 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1568 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1570 video_uploader = mobj.group(1)
1573 # Process video information
1574 self._downloader.process_info({
1575 'id': video_id.decode('utf-8'),
1576 'url': video_url.decode('utf-8'),
1577 'uploader': video_uploader.decode('utf-8'),
1578 'upload_date': u'NA',
1579 'title': video_title,
1580 'stitle': simple_title,
1581 'ext': video_extension.decode('utf-8'),
1585 except UnavailableVideoError:
1586 self._downloader.trouble(u'\nERROR: unable to download video')
1589 class GoogleIE(InfoExtractor):
1590 """Information extractor for video.google.com."""
# NOTE(review): numbered listing with elided lines; missing `try:`/`return`/
# `else:` statements below.
1592 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1593 IE_NAME = u'video.google'
1595 def __init__(self, downloader=None):
1596 InfoExtractor.__init__(self, downloader)
1598 def report_download_webpage(self, video_id):
1599 """Report webpage download."""
1600 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1602 def report_extraction(self, video_id):
1603 """Report information extraction."""
1604 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No session setup needed (body elided in this listing).
1606 def _real_initialize(self):
1609 def _real_extract(self, url):
1610 # Extract id from URL
1611 mobj = re.match(self._VALID_URL, url)
1613 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1616 # At this point we have a new video
1617 self._downloader.increment_downloads()
1618 video_id = mobj.group(1)
1620 video_extension = 'mp4'
1622 # Retrieve video webpage to extract further information
1623 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1625 self.report_download_webpage(video_id)
1626 webpage = urllib2.urlopen(request).read()
1627 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1628 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1631 # Extract URL, uploader, and title from webpage
1632 self.report_extraction(video_id)
# Prefer the direct mp4 download_url; fall back to the flv stream URL, which
# is JS-hex-escaped in the page (\x3d / \x26 for '=' / '&').
1633 mobj = re.search(r"download_url:'([^']+)'", webpage)
1635 video_extension = 'flv'
1636 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1638 self._downloader.trouble(u'ERROR: unable to extract media URL')
1640 mediaURL = urllib.unquote(mobj.group(1))
1641 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1642 mediaURL = mediaURL.replace('\\x26', '\x26')
1644 video_url = mediaURL
1646 mobj = re.search(r'<title>(.*)</title>', webpage)
1648 self._downloader.trouble(u'ERROR: unable to extract title')
1650 video_title = mobj.group(1).decode('utf-8')
1651 video_title = sanitize_title(video_title)
1652 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1654 # Extract video description
1655 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1657 self._downloader.trouble(u'ERROR: unable to extract video description')
1659 video_description = mobj.group(1).decode('utf-8')
1660 if not video_description:
1661 video_description = 'No description available.'
1663 # Extract video thumbnail
# Thumbnail requires a second request to the search page, so it is only
# fetched when the user asked for it.
1664 if self._downloader.params.get('forcethumbnail', False):
1665 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1667 webpage = urllib2.urlopen(request).read()
1668 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1669 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1671 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1673 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1675 video_thumbnail = mobj.group(1)
1676 else: # we need something to pass to process_info
1677 video_thumbnail = ''
1680 # Process video information
1681 self._downloader.process_info({
1682 'id': video_id.decode('utf-8'),
1683 'url': video_url.decode('utf-8'),
1685 'upload_date': u'NA',
1686 'title': video_title,
1687 'stitle': simple_title,
1688 'ext': video_extension.decode('utf-8'),
1692 except UnavailableVideoError:
1693 self._downloader.trouble(u'\nERROR: unable to download video')
1696 class PhotobucketIE(InfoExtractor):
1697 """Information extractor for photobucket.com."""
# NOTE(review): numbered listing with elided lines; missing `try:`/`return`
# statements below.
1699 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1700 IE_NAME = u'photobucket'
1702 def __init__(self, downloader=None):
1703 InfoExtractor.__init__(self, downloader)
1705 def report_download_webpage(self, video_id):
1706 """Report webpage download."""
1707 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1709 def report_extraction(self, video_id):
1710 """Report information extraction."""
1711 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No session setup needed (body elided in this listing).
1713 def _real_initialize(self):
1716 def _real_extract(self, url):
1717 # Extract id from URL
1718 mobj = re.match(self._VALID_URL, url)
1720 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1723 # At this point we have a new video
1724 self._downloader.increment_downloads()
1725 video_id = mobj.group(1)
1727 video_extension = 'flv'
1729 # Retrieve video webpage to extract further information
1730 request = urllib2.Request(url)
1732 self.report_download_webpage(video_id)
1733 webpage = urllib2.urlopen(request).read()
1734 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1735 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1738 # Extract URL, uploader, and title from webpage
1739 self.report_extraction(video_id)
# Media URL comes from the video_src <link> tag's file= parameter.
1740 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1742 self._downloader.trouble(u'ERROR: unable to extract media URL')
1744 mediaURL = urllib.unquote(mobj.group(1))
1746 video_url = mediaURL
# One regex yields both title (group 1) and uploader (group 2).
1748 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1750 self._downloader.trouble(u'ERROR: unable to extract title')
1752 video_title = mobj.group(1).decode('utf-8')
1753 video_title = sanitize_title(video_title)
1754 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1756 video_uploader = mobj.group(2).decode('utf-8')
1759 # Process video information
1760 self._downloader.process_info({
1761 'id': video_id.decode('utf-8'),
1762 'url': video_url.decode('utf-8'),
1763 'uploader': video_uploader,
1764 'upload_date': u'NA',
1765 'title': video_title,
1766 'stitle': simple_title,
1767 'ext': video_extension.decode('utf-8'),
1771 except UnavailableVideoError:
1772 self._downloader.trouble(u'\nERROR: unable to download video')
1775 class YahooIE(InfoExtractor):
1776 """Information extractor for video.yahoo.com."""
# NOTE(review): numbered listing with elided lines; `try:`/`return` statements
# are missing in places below.
1778 # _VALID_URL matches all Yahoo! Video URLs
1779 # _VPAGE_URL matches only the extractable '/watch/' URLs
1780 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1781 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1782 IE_NAME = u'video.yahoo'
1784 def __init__(self, downloader=None):
1785 InfoExtractor.__init__(self, downloader)
1787 def report_download_webpage(self, video_id):
1788 """Report webpage download."""
1789 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1791 def report_extraction(self, video_id):
1792 """Report information extraction."""
1793 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No session setup needed (body elided in this listing).
1795 def _real_initialize(self):
# new_video=False marks the recursive second pass after rewriting a
# non-/watch/ URL into the canonical /watch/ form.
1798 def _real_extract(self, url, new_video=True):
1799 # Extract ID from URL
1800 mobj = re.match(self._VALID_URL, url)
1802 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1805 # At this point we have a new video
1806 self._downloader.increment_downloads()
1807 video_id = mobj.group(2)
1808 video_extension = 'flv'
1810 # Rewrite valid but non-extractable URLs as
1811 # extractable English language /watch/ URLs
1812 if re.match(self._VPAGE_URL, url) is None:
1813 request = urllib2.Request(url)
1815 webpage = urllib2.urlopen(request).read()
1816 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1817 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1820 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1822 self._downloader.trouble(u'ERROR: Unable to extract id field')
1824 yahoo_id = mobj.group(1)
1826 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1828 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1830 yahoo_vid = mobj.group(1)
1832 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1833 return self._real_extract(url, new_video=False)
1835 # Retrieve video webpage to extract further information
1836 request = urllib2.Request(url)
1838 self.report_download_webpage(video_id)
1839 webpage = urllib2.urlopen(request).read()
1840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1844 # Extract uploader and title from webpage
1845 self.report_extraction(video_id)
1846 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1848 self._downloader.trouble(u'ERROR: unable to extract video title')
1850 video_title = mobj.group(1).decode('utf-8')
1851 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1853 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1855 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above is the literal alternation
# (people|profile); the uploader name appears to be group(2) — looks like a
# bug, TODO confirm against the page markup before changing.
1857 video_uploader = mobj.group(1).decode('utf-8')
1859 # Extract video thumbnail
1860 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1862 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1864 video_thumbnail = mobj.group(1).decode('utf-8')
1866 # Extract video description
1867 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1869 self._downloader.trouble(u'ERROR: unable to extract video description')
1871 video_description = mobj.group(1).decode('utf-8')
1872 if not video_description:
1873 video_description = 'No description available.'
1875 # Extract video height and width
# Height/width feed the playlist request below; the server 401s without them.
1876 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1878 self._downloader.trouble(u'ERROR: unable to extract video height')
1880 yv_video_height = mobj.group(1)
1882 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1884 self._downloader.trouble(u'ERROR: unable to extract video width')
1886 yv_video_width = mobj.group(1)
1888 # Retrieve video playlist to extract media URL
1889 # I'm not completely sure what all these options are, but we
1890 # seem to need most of them, otherwise the server sends a 401.
1891 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1892 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1893 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1894 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1895 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1897 self.report_download_webpage(video_id)
1898 webpage = urllib2.urlopen(request).read()
1899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1903 # Extract media URL from playlist XML
1904 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1906 self._downloader.trouble(u'ERROR: Unable to extract media URL')
# htmlentity_transform decodes &amp;-style entities embedded in the XML URL.
1908 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1909 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1912 # Process video information
1913 self._downloader.process_info({
1914 'id': video_id.decode('utf-8'),
1916 'uploader': video_uploader,
1917 'upload_date': u'NA',
1918 'title': video_title,
1919 'stitle': simple_title,
1920 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' appears twice in this literal (here and below);
# in Python the later entry silently wins, so the .decode('utf-8') here is
# dead — worth deduplicating.
1921 'thumbnail': video_thumbnail.decode('utf-8'),
1922 'description': video_description,
1923 'thumbnail': video_thumbnail,
1926 except UnavailableVideoError:
1927 self._downloader.trouble(u'\nERROR: unable to download video')
1930 class VimeoIE(InfoExtractor):
1931 """Information extractor for vimeo.com."""
# NOTE(review): numbered listing with elided lines; `try:`/`return` statements
# are missing in places below.
1933 # _VALID_URL matches Vimeo URLs
1934 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1937 def __init__(self, downloader=None):
1938 InfoExtractor.__init__(self, downloader)
1940 def report_download_webpage(self, video_id):
1941 """Report webpage download."""
1942 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1944 def report_extraction(self, video_id):
1945 """Report information extraction."""
1946 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# No session setup needed (body elided in this listing).
1948 def _real_initialize(self):
1951 def _real_extract(self, url, new_video=True):
1952 # Extract ID from URL
1953 mobj = re.match(self._VALID_URL, url)
1955 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1958 # At this point we have a new video
1959 self._downloader.increment_downloads()
1960 video_id = mobj.group(1)
1962 # Retrieve video webpage to extract further information
# moogaloop/load returns an XML clip description; all fields below are pulled
# from it with regexes rather than an XML parser.
1963 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1965 self.report_download_webpage(video_id)
1966 webpage = urllib2.urlopen(request).read()
1967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1968 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1971 # Now we begin extracting as much information as we can from what we
1972 # retrieved. First we extract the information common to all extractors,
1973 # and latter we extract those that are Vimeo specific.
1974 self.report_extraction(video_id)
1977 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1979 self._downloader.trouble(u'ERROR: unable to extract video title')
1981 video_title = mobj.group(1).decode('utf-8')
1982 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1985 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1987 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1989 video_uploader = mobj.group(1).decode('utf-8')
1991 # Extract video thumbnail
1992 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1994 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1996 video_thumbnail = mobj.group(1).decode('utf-8')
1998 # # Extract video description
1999 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2001 # self._downloader.trouble(u'ERROR: unable to extract video description')
2003 # video_description = mobj.group(1).decode('utf-8')
2004 # if not video_description: video_description = 'No description available.'
# Placeholder while real description extraction (above) is commented out.
2005 video_description = 'Foo.'
2007 # Vimeo specific: extract request signature
# The signature + expiry are required query parameters of the play URL below.
2008 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2010 self._downloader.trouble(u'ERROR: unable to extract request signature')
2012 sig = mobj.group(1).decode('utf-8')
2014 # Vimeo specific: Extract request signature expiration
2015 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2017 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2019 sig_exp = mobj.group(1).decode('utf-8')
2021 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2024 # Process video information
2025 self._downloader.process_info({
2026 'id': video_id.decode('utf-8'),
2028 'uploader': video_uploader,
2029 'upload_date': u'NA',
2030 'title': video_title,
2031 'stitle': simple_title,
# NOTE(review): 'thumbnail' and 'description' each appear twice in this
# literal; the later entries silently win in Python, making the first
# 'thumbnail' (with .decode) and first 'description' dead — deduplicate.
2033 'thumbnail': video_thumbnail.decode('utf-8'),
2034 'description': video_description,
2035 'thumbnail': video_thumbnail,
2036 'description': video_description,
2039 except UnavailableVideoError:
2040 self._downloader.trouble(u'ERROR: unable to download video')
2043 class GenericIE(InfoExtractor):
2044 """Generic last-resort information extractor."""
2047 IE_NAME = u'generic'
# Constructor of GenericIE (class header above; its _real_extract is cut off
# at the end of this chunk): just delegates to the base class.
2049 def __init__(self, downloader=None):
2050 InfoExtractor.__init__(self, downloader)
2052 def report_download_webpage(self, video_id):
2053 """Report webpage download."""
# Also warns that the generic (regex-guessing) extractor is being used.
2054 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2055 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2057 def report_extraction(self, video_id):
2058 """Report information extraction."""
2059 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2061 def _real_initialize(self):
2064 def _real_extract(self, url):
2065 # At this point we have a new video
2066 self._downloader.increment_downloads()
2068 video_id = url.split('/')[-1]
2069 request = urllib2.Request(url)
2071 self.report_download_webpage(video_id)
2072 webpage = urllib2.urlopen(request).read()
2073 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2074 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2076 except ValueError, err:
2077 # since this is the last-resort InfoExtractor, if
2078 # this error is thrown, it'll be thrown here
2079 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2082 self.report_extraction(video_id)
2083 # Start with something easy: JW Player in SWFObject
2084 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2086 # Broaden the search a little bit
2087 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2089 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2092 # It's possible that one of the regexes
2093 # matched, but returned an empty group:
2094 if mobj.group(1) is None:
2095 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2098 video_url = urllib.unquote(mobj.group(1))
2099 video_id = os.path.basename(video_url)
2101 # here's a fun little line of code for you:
2102 video_extension = os.path.splitext(video_id)[1][1:]
2103 video_id = os.path.splitext(video_id)[0]
2105 # it's tempting to parse this further, but you would
2106 # have to take into account all the variations like
2107 # Video Title - Site Name
2108 # Site Name | Video Title
2109 # Video Title - Tagline | Site Name
2110 # and so on and so forth; it's just not practical
2111 mobj = re.search(r'<title>(.*)</title>', webpage)
2113 self._downloader.trouble(u'ERROR: unable to extract title')
2115 video_title = mobj.group(1).decode('utf-8')
2116 video_title = sanitize_title(video_title)
2117 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2119 # video uploader is domain name
2120 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2122 self._downloader.trouble(u'ERROR: unable to extract title')
2124 video_uploader = mobj.group(1).decode('utf-8')
2127 # Process video information
2128 self._downloader.process_info({
2129 'id': video_id.decode('utf-8'),
2130 'url': video_url.decode('utf-8'),
2131 'uploader': video_uploader,
2132 'upload_date': u'NA',
2133 'title': video_title,
2134 'stitle': simple_title,
2135 'ext': video_extension.decode('utf-8'),
2139 except UnavailableVideoError, err:
2140 self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # Hard upper bound on results a single search will ever yield.
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to the regular YouTube IE.
        self._youtube_ie = youtube_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'ytsearch<N|all>:' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # No number given: download a single result.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # The indicator matches e.g. href="/watch?v=ID"; slice the id
                # out of the raw match and drop the trailing quote.
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: exhausted all result pages.
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
    _MORE_PAGES_INDICATOR = r'<span>Next</span>'
    # Hard upper bound on results a single search will ever yield.
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, google_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to the Google Video IE.
        self._google_ie = google_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._google_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'gvsearch<N|all>:' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # No number given: download a single result.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Group 1 is the docid captured by _VIDEO_INDICATOR.
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: exhausted all result pages.
                for id in video_ids:
                    self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard upper bound on results a single search will ever yield.
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, yahoo_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to the Yahoo Video IE.
        self._yahoo_ie = yahoo_ie

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._yahoo_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'yvsearch<N|all>:' prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # No number given: download a single result.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Group 1 is the "NNN/NNN" watch path captured by _VIDEO_INDICATOR.
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: exhausted all result pages.
                for id in video_ids:
                    self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matches view_play_list/my_playlists/artist/playlist URLs; group(1) is the
    # list-type key (p/a/list), group(2) the playlist id, group(3) an optional
    # direct video id.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    IE_NAME = u'youtube:playlist'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to the regular YouTube IE.
        self._youtube_ie = youtube_ie

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all video ids of a playlist and hand each to the YouTube IE."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # URL pointed at a single video inside the playlist: extract just it.
        if mobj.group(3) is not None:
            self._youtube_ie.extract(mobj.group(3))

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop once there is no "Next" link on the page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            pagenum = pagenum + 1

        # Apply --playlist-start/--playlist-end trimming (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[playliststart:playlistend]

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request, so uploads are fetched page by page.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    IE_NAME = u'youtube:user'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Per-video extraction is delegated to the regular YouTube IE.
        self._youtube_ie = youtube_ie

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and extract each."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got

            # start-index is 1-based in the GData API.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start/--playlist-end trimming (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        # NOTE(review): plain (non-u'') string here, unlike the rest of the file.
        self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_initialize(self):
        """No initialization is needed for DepositFiles."""

    def _real_extract(self, url):
        # At this point we have a new file
        self._downloader.increment_downloads()

        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates clicking the free-download button).
        free_download_indication = { 'gateway_result' : '1' }
        request = urllib2.Request(url, urllib.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        file_title = mobj.group(1).decode('utf-8')

        # Process file information
            self._downloader.process_info({
                'id': file_id.decode('utf-8'),
                'url': file_url.decode('utf-8'),
                'upload_date': u'NA',
                'title': file_title,
                'stitle': file_title,
                'ext': file_extension.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; format selection walks this list.
    _available_formats = ['highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes keyed by the info-dict field they populate.
        data = {'title': r'class="video_title datawrap">(.*?)</',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'upload_date': r'data-date="(.*?)"',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc credentials."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Anonymous access is fine for public videos; skip login entirely.
        if useremail is None:

        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Thumbnail is optional: missing one only produces a warning.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                # Restrict to formats at or below the requested quality cap.
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:

            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension usually matters more than format
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'upload_date': upload_date,
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)

    def _simplify_title(self, title):
        """Reduce a title to filesystem-safe characters."""
        res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
        res = res.strip(ur'_')

    def _real_extract(self, url):
        """Fetch blip.tv's JSON metadata for the page and queue the download."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask the same URL for its JSON representation.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = urllib2.Request(json_url)
        self.report_extraction(mobj.group(1))
            json_code = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

            # blip.tv timestamps look like '08-12-11 12:34PM'; normalize to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            self._downloader.increment_downloads()

                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'stitle': self._simplify_title(data['title']),
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
        except (ValueError,KeyError), err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

            self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """No initialization is needed for myvideo.de."""

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): 'self._download' looks like a typo for
            # 'self._downloader' — this line would raise AttributeError if hit;
            # confirm against the full source before fixing.
            self._download.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)
        # The URL's slug already serves as a simple title.
        simple_title = mobj.group(2).decode('utf-8')
        # should actually not be necessary
        simple_title = sanitize_title(simple_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)

        request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

        self.report_extraction(video_id)
        # The thumbnail URL's directory also hosts the flv media file.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1)
        video_title = sanitize_title(video_title)

            self._downloader.process_info({
                'upload_date': u'NA',
                'title': video_title,
                'stitle': simple_title,
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: Unable to download video')
# NOTE(review): this excerpt is a line-numbered dump with sampling gaps --
# guard clauses ('if mobj is None:'), 'try:' headers, 'else:' branches,
# 'return' statements and parts of literals are missing between the numbered
# lines below.  Comments describe only what is visible; confirm against the
# full file before editing.
3011 class ComedyCentralIE(InfoExtractor):
3012 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a bare short name (tds/thedailyshow/cr/colbert/...) or a
# full-episodes URL on thedailyshow.com / colbertnation.com.  Named groups:
# 'shortname', 'showname', 'episode'.
3014 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3015 IE_NAME = u'comedycentral'
# --- progress reporters: thin wrappers around FileDownloader.to_screen() ---
3017 def report_extraction(self, episode_id):
3018 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3020 def report_config_download(self, episode_id):
3021 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3023 def report_index_download(self, episode_id):
3024 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3026 def report_player_url(self, episode_id):
3027 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Collapse every run of characters outside simple_title_chars into '_', then
# strip leading/trailing underscores.  NOTE(review): the 'return res' line
# (around original line 3032) is missing from this excerpt.
3029 def _simplify_title(self, title):
3030 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3031 res = res.strip(ur'_')
# Main extraction entry point; reports errors via self._downloader.trouble().
3034 def _real_extract(self, url):
3035 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard before this trouble() call is missing here)
3037 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Short names are rewritten to the show's full-episodes index URL and the
# regex is re-applied so the named groups are populated for the new URL.
3040 if mobj.group('shortname'):
3041 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3042 url = 'http://www.thedailyshow.com/full-episodes/'
3044 url = 'http://www.colbertnation.com/full-episodes/'
3045 mobj = re.match(self._VALID_URL, url)
3046 assert mobj is not None
# dlNewest: no explicit episode -> rely on the site redirect to the newest one.
3048 dlNewest = not mobj.group('episode')
3050 epTitle = mobj.group('showname')
3052 epTitle = mobj.group('episode')
# Fetch the episode page with urllib2; network failures are reported, not raised.
3054 req = urllib2.Request(url)
3055 self.report_extraction(epTitle)
3057 htmlHandle = urllib2.urlopen(req)
3058 html = htmlHandle.read()
3059 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3060 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# After redirects the final URL must still match _VALID_URL and must name a
# concrete episode, otherwise extraction is aborted.
3063 url = htmlHandle.geturl()
3064 mobj = re.match(self._VALID_URL, url)
3066 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3068 if mobj.group('episode') == '':
3069 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3071 epTitle = mobj.group('episode')
# The flash <param name="movie"> embeds both the player URL (group 0) and the
# mtvnservices 'uri' (group 1) used to query the mrss feed below.
3073 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3074 if len(mMovieParams) == 0:
3075 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
# Resolve the player URL through its redirect chain.
3078 playerUrl_raw = mMovieParams[0][0]
3079 self.report_player_url(epTitle)
3081 urlHandle = urllib2.urlopen(playerUrl_raw)
3082 playerUrl = urlHandle.geturl()
3083 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3084 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
# Download the mrss show index feed for this uri.
3087 uri = mMovieParams[0][1]
3088 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3089 self.report_index_download(epTitle)
3091 indexXml = urllib2.urlopen(indexUrl).read()
3092 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3093 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# One <item> per media segment; <guid> looks like '...:<showid>.com:<mediaid>'
# given how it is split below.
3096 idoc = xml.etree.ElementTree.fromstring(indexXml)
3097 itemEls = idoc.findall('.//item')
3098 for itemEl in itemEls:
3099 mediaId = itemEl.findall('./guid')[0].text
3100 shortMediaId = mediaId.split(':')[-1]
3101 showId = mediaId.split(':')[-2].replace('.com', '')
3102 officialTitle = itemEl.findall('./title')[0].text
3103 officialDate = itemEl.findall('./pubDate')[0].text
# Per-segment mediagen config XML lists the available renditions.
3105 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3106 urllib.urlencode({'uri': mediaId}))
3107 configReq = urllib2.Request(configUrl)
3108 self.report_config_download(epTitle)
3110 configXml = urllib2.urlopen(configReq).read()
3111 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3112 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3115 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, src) pairs; NOTE(review): the lines initializing and
# appending to 'turls' are missing from this excerpt.
3117 for rendition in cdoc.findall('.//rendition'):
3118 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3122 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3125 # For now, just pick the highest bitrate
3126 format,video_url = turls[-1]
3128 self._downloader.increment_downloads()
3130 effTitle = showId + '-' + epTitle
# info dict consumed by FileDownloader.process_info(); several keys (id, url,
# ext, format, title, ...) fall in the missing lines of this excerpt.
3135 'upload_date': officialDate,
3137 'stitle': self._simplify_title(effTitle),
3141 'description': officialTitle,
3142 'player_url': playerUrl
3146 self._downloader.process_info(info)
3147 except UnavailableVideoError, err:
3148 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
# NOTE(review): line-numbered dump with sampling gaps -- 'if ... is None:'
# guards, 'try:' headers and 'return' statements are missing between the
# numbered lines.  Comments describe only what is visible.
3152 class EscapistIE(InfoExtractor):
3153 """Information extractor for The Escapist """
# NOTE(review): '(www\.)' has no '?', so non-www URLs cannot match even though
# the scheme is optional -- looks like a bug; confirm against upstream before
# fixing.  Also note the unescaped dot in 'escapistmagazine.com'.
3155 _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?].*$'
3156 IE_NAME = u'escapist'
# --- progress reporters: thin wrappers around FileDownloader.to_screen() ---
3158 def report_extraction(self, showName):
3159 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3161 def report_config_download(self, showName):
3162 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Same title simplifier as the other extractors; NOTE(review): 'return res'
# (around original line 3167) is missing from this excerpt.
3164 def _simplify_title(self, title):
3165 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3166 res = res.strip(ur'_')
3169 def _real_extract(self, url):
# HTMLParser instance used only for its unescape() of HTML entities in the
# scraped <meta> values below.
3170 htmlParser = HTMLParser.HTMLParser()
3172 mobj = re.match(self._VALID_URL, url)
3174 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3176 showName = mobj.group('showname')
3177 videoId = mobj.group('episode')
3179 self.report_extraction(showName)
3181 webPage = urllib2.urlopen(url).read()
3182 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3183 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Scrape description, thumbnail and player URL from the page's meta tags;
# NOTE(review): no None-checks on these matches are visible -- a missing tag
# would raise AttributeError on .group(1).
3186 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3187 description = htmlParser.unescape(descMatch.group(1))
3188 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3189 imgUrl = htmlParser.unescape(imgMatch.group(1))
3190 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3191 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
# The player URL carries a percent-encoded 'config=' query parameter pointing
# at the playlist configuration.
3192 configUrlMatch = re.search('config=(.*)$', playerUrl)
3193 configUrl = urllib2.unquote(configUrlMatch.group(1))
3195 self.report_config_download(showName)
3197 configJSON = urllib2.urlopen(configUrl).read()
3198 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3199 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3202 # Technically, it's JavaScript, not JSON
# Crude JS->JSON fixup: swap single quotes for double quotes so json.loads
# accepts it.  Breaks if any value legitimately contains a quote character.
3203 configJSON = configJSON.replace("'", '"')
3206 config = json.loads(configJSON)
3207 except (ValueError,), err:
3208 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# The second playlist entry holds the actual video URL.
3211 playlist = config['playlist']
3212 videoUrl = playlist[1]['url']
3214 self._downloader.increment_downloads()
# info dict for FileDownloader.process_info(); several keys (id, url, ext,
# title, ...) fall in the missing lines of this excerpt.
3218 'uploader': showName,
3219 'upload_date': None,
3221 'stitle': self._simplify_title(showName),
3224 'thumbnail': imgUrl,
3225 'description': description,
3226 'player_url': playerUrl,
3230 self._downloader.process_info(info)
3231 except UnavailableVideoError, err:
3232 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
# NOTE(review): line-numbered dump with gaps -- parts of the docstrings
# (including their closing quotes) are missing from this excerpt.
3236 class PostProcessor(object):
3237 """Post Processor class.
3239 PostProcessor objects can be added to downloaders with their
3240 add_post_processor() method. When the downloader has finished a
3241 successful download, it will take its internal chain of PostProcessors
3242 and start calling the run() method on each one of them, first with
3243 an initial argument and then with the returned value of the previous
3246 The chain will be stopped if one of them ever returns None or the end
3247 of the chain is reached.
3249 PostProcessor objects follow a "mutual registration" process similar
3250 to InfoExtractor objects.
# _downloader holds the owning FileDownloader (or None until registered).
3255 def __init__(self, downloader=None):
3256 self._downloader = downloader
3258 def set_downloader(self, downloader):
3259 """Sets the downloader for this PP."""
3260 self._downloader = downloader
# Base implementation is the identity: pass the info dict through unchanged.
3262 def run(self, information):
3263 """Run the PostProcessor.
3265 The "information" argument is a dictionary like the ones
3266 composed by InfoExtractors. The only difference is that this
3267 one has an extra field called "filepath" that points to the
3270 When this method returns None, the postprocessing chain is
3271 stopped. However, this method may return an information
3272 dictionary that will be passed to the next postprocessing
3273 object in the chain. It can be the one it received after
3274 changing some fields.
3276 In addition, this method may raise a PostProcessingError
3277 exception that will be taken into account by the downloader
3280 return information # by default, do nothing
# Post processor that converts a downloaded video file into an audio-only
# file using ffmpeg/ffprobe.  NOTE(review): line-numbered dump with gaps --
# decorators, 'try:' headers, 'return' statements and some branch bodies are
# missing from this excerpt.
3283 class FFmpegExtractAudioPP(PostProcessor):
# preferredcodec: 'best' (default), 'aac' or 'mp3' (validated in _real_main).
3285 def __init__(self, downloader=None, preferredcodec=None):
3286 PostProcessor.__init__(self, downloader)
3287 if preferredcodec is None:
3288 preferredcodec = 'best'
3289 self._preferredcodec = preferredcodec
# Probe the file with ffprobe and return the codec_name of its audio stream.
# (Presumably decorated @staticmethod on the missing line above -- confirm.)
3292 def get_audio_codec(path):
3294 cmd = ['ffprobe', '-show_streams', '--', path]
# Python 2 'file()' used to discard ffprobe's stderr.
3295 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3296 output = handle.communicate()[0]
3297 if handle.wait() != 0:
3299 except (IOError, OSError):
# Scan -show_streams output: remember the last codec_name seen, accept it
# once the matching codec_type=audio line appears.
3302 for line in output.split('\n'):
3303 if line.startswith('codec_name='):
3304 audio_codec = line.split('=')[1].strip()
3305 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Run ffmpeg to transcode/remux 'path' into 'out_path' with the given audio
# codec and extra options; video is dropped ('-vn').
3310 def run_ffmpeg(path, out_path, codec, more_opts):
3312 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3313 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3315 except (IOError, OSError):
# PostProcessor entry point: replace information['filepath'] with the
# extracted audio file.
3318 def run(self, information):
3319 path = information['filepath']
3321 filecodec = self.get_audio_codec(path)
3322 if filecodec is None:
3323 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
# Lossless copy path when the source codec already matches the preference.
# NOTE(review): the "acodec = 'copy'"-style assignment for this branch falls
# in the missing lines -- confirm before relying on 'acodec' here.
3327 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3328 if filecodec == 'aac' or filecodec == 'mp3':
3329 # Lossless if possible
3331 extension = filecodec
3332 if filecodec == 'aac':
# Raw AAC must be wrapped in an ADTS container.
3333 more_opts = ['-f', 'adts']
3336 acodec = 'libmp3lame'
3338 more_opts = ['-ab', '128k']
3340 # We convert the audio (lossy)
3341 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
3342 extension = self._preferredcodec
3343 more_opts = ['-ab', '128k']
3344 if self._preferredcodec == 'aac':
3345 more_opts += ['-f', 'adts']
# Write next to the source file, swapping only the extension.
3347 (prefix, ext) = os.path.splitext(path)
3348 new_path = prefix + '.' + extension
3349 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3350 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3353 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Best-effort removal of the original video file; failure only warns.
3358 except (IOError, OSError):
3359 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3362 information['filepath'] = new_path
# NOTE(review): line-numbered dump with gaps -- the 'try:' headers and the
# urlh.close()/outf.close() lines are missing from this excerpt.
3366 def updateSelf(downloader, filename):
3367 ''' Update the program file with the latest version from the repository '''
3368 # Note: downloader only used for options
# Refuse to continue if we cannot rewrite our own script file.
3369 if not os.access(filename, os.W_OK):
3370 sys.exit('ERROR: no write permissions on %s' % filename)
3372 downloader.to_screen('Updating to latest version...')
# Fetch the raw script from UPDATE_URL (plain urllib, no auth).
3376 urlh = urllib.urlopen(UPDATE_URL)
3377 newcontent = urlh.read()
3380 except (IOError, OSError), err:
3381 sys.exit('ERROR: unable to download latest version')
# Overwrite this script in binary mode with the downloaded content.
3384 outf = open(filename, 'wb')
3386 outf.write(newcontent)
3389 except (IOError, OSError), err:
3390 sys.exit('ERROR: unable to overwrite current version')
3392 downloader.to_screen('Updated youtube-dl. Restart to use the new version.')
# Custom optparse option formatter (installed on the help formatter below).
# NOTE(review): the 'opts = []' initialization (around original line 3402)
# is missing from this excerpt.
3399 def _format_option_string(option):
3400 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Show at most one short and one long form, comma-separated.
3404 if option._short_opts: opts.append(option._short_opts[0])
3405 if option._long_opts: opts.append(option._long_opts[0])
3406 if len(opts) > 1: opts.insert(1, ', ')
# Append the metavar only for options that take a value.
3408 if option.takes_value(): opts.append(' %s' % option.metavar)
3410 return "".join(opts)
# Best-effort terminal width detection for help formatting.
# NOTE(review): the COLUMNS fast path's return, the 'try:' header and the
# trailing except/fallback lines are missing from this excerpt.
3412 def _find_term_columns():
# Prefer the COLUMNS environment variable when present.
3413 columns = os.environ.get('COLUMNS', None)
# Fall back to asking the terminal via 'stty size' ("rows cols").
3418 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3419 out,err = sp.communicate()
3420 return int(out.split()[1])
# --- parseOpts body continues (the enclosing 'def parseOpts():' line falls
# before this excerpt's sampled lines).  NOTE(review): the 'max_width'
# default, the 'kw = {' opener and a few add_option calls are among the
# missing lines. ---
3426 max_help_position = 80
3428 # No need to wrap help messages if we're on a wide console
3429 columns = _find_term_columns()
3430 if columns: max_width = columns
# Install the custom option-string formatter defined above.
3432 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3433 fmt.format_option_strings = _format_option_string
# Keyword arguments for OptionParser (dict literal partially elided).
3436 'version' : __version__,
3438 'usage' : '%prog [options] url [url...]',
3439 'conflict_handler' : 'resolve',
3442 parser = optparse.OptionParser(**kw)
# One OptionGroup per help section.
3445 general = optparse.OptionGroup(parser, 'General Options')
3446 selection = optparse.OptionGroup(parser, 'Video Selection')
3447 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3448 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3449 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3450 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3451 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# General options.
3453 general.add_option('-h', '--help',
3454 action='help', help='print this help text and exit')
3455 general.add_option('-v', '--version',
3456 action='version', help='print program version and exit')
3457 general.add_option('-U', '--update',
3458 action='store_true', dest='update_self', help='update this program to latest version')
3459 general.add_option('-i', '--ignore-errors',
3460 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3461 general.add_option('-r', '--rate-limit',
3462 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3463 general.add_option('-R', '--retries',
3464 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3465 general.add_option('--dump-user-agent',
3466 action='store_true', dest='dump_user_agent',
3467 help='display the current browser identification', default=False)
3468 general.add_option('--list-extractors',
3469 action='store_true', dest='list_extractors',
3470 help='List all supported extractors and the URLs they would handle', default=False)
# Video selection options (playlist range, title filters).
3472 selection.add_option('--playlist-start',
3473 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3474 selection.add_option('--playlist-end',
3475 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3476 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3477 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
# Authentication options.
3479 authentication.add_option('-u', '--username',
3480 dest='username', metavar='USERNAME', help='account username')
3481 authentication.add_option('-p', '--password',
3482 dest='password', metavar='PASSWORD', help='account password')
3483 authentication.add_option('-n', '--netrc',
3484 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# Video format options.
3487 video_format.add_option('-f', '--format',
3488 action='store', dest='format', metavar='FORMAT', help='video format code')
3489 video_format.add_option('--all-formats',
3490 action='store_const', dest='format', help='download all available video formats', const='-1')
3491 video_format.add_option('--max-quality',
3492 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
# Verbosity / simulation options.
3495 verbosity.add_option('-q', '--quiet',
3496 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3497 verbosity.add_option('-s', '--simulate',
3498 action='store_true', dest='simulate', help='do not download video', default=False)
3499 verbosity.add_option('-g', '--get-url',
3500 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3501 verbosity.add_option('-e', '--get-title',
3502 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3503 verbosity.add_option('--get-thumbnail',
3504 action='store_true', dest='getthumbnail',
3505 help='simulate, quiet but print thumbnail URL', default=False)
3506 verbosity.add_option('--get-description',
3507 action='store_true', dest='getdescription',
3508 help='simulate, quiet but print video description', default=False)
3509 verbosity.add_option('--get-filename',
3510 action='store_true', dest='getfilename',
3511 help='simulate, quiet but print output filename', default=False)
3512 verbosity.add_option('--no-progress',
3513 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3514 verbosity.add_option('--console-title',
3515 action='store_true', dest='consoletitle',
3516 help='display progress in console titlebar', default=False)
# Filesystem options (naming, batching, resume, cookies, metadata files).
3519 filesystem.add_option('-t', '--title',
3520 action='store_true', dest='usetitle', help='use title in file name', default=False)
3521 filesystem.add_option('-l', '--literal',
3522 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3523 filesystem.add_option('-A', '--auto-number',
3524 action='store_true', dest='autonumber',
3525 help='number downloaded files starting from 00000', default=False)
3526 filesystem.add_option('-o', '--output',
3527 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3528 filesystem.add_option('-a', '--batch-file',
3529 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3530 filesystem.add_option('-w', '--no-overwrites',
3531 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3532 filesystem.add_option('-c', '--continue',
3533 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3534 filesystem.add_option('--cookies',
3535 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3536 filesystem.add_option('--no-part',
3537 action='store_true', dest='nopart', help='do not use .part files', default=False)
3538 filesystem.add_option('--no-mtime',
3539 action='store_false', dest='updatetime',
3540 help='do not use the Last-modified header to set the file modification time', default=True)
3541 filesystem.add_option('--write-description',
3542 action='store_true', dest='writedescription',
3543 help='write video description to a .description file', default=False)
3544 filesystem.add_option('--write-info-json',
3545 action='store_true', dest='writeinfojson',
3546 help='write video metadata to a .info.json file', default=False)
# Post-processing options (audio extraction via ffmpeg/ffprobe).
3549 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3550 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3551 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3552 help='"best", "aac" or "mp3"; best by default')
# Register the groups in display order, then parse argv.
3555 parser.add_option_group(general)
3556 parser.add_option_group(selection)
3557 parser.add_option_group(filesystem)
3558 parser.add_option_group(verbosity)
3559 parser.add_option_group(video_format)
3560 parser.add_option_group(authentication)
3561 parser.add_option_group(postproc)
3563 opts, args = parser.parse_args()
3565 return parser, opts, args
# NOTE(review): the returned list literal is only partially present in this
# excerpt ('return [' and several extractor entries fall in missing lines).
3567 def gen_extractors():
3568 """ Return a list of an instance of every supported extractor.
3569 The order does matter; the first extractor matched is the one handling the URL.
# Shared instances: some extractors delegate to these (e.g. Metacafe pages
# embedding YouTube videos).
3571 youtube_ie = YoutubeIE()
3572 google_ie = GoogleIE()
3573 yahoo_ie = YahooIE()
# Visible entries of the ordered extractor list.
3576 MetacafeIE(youtube_ie),
3578 YoutubePlaylistIE(youtube_ie),
3579 YoutubeUserIE(youtube_ie),
3580 YoutubeSearchIE(youtube_ie),
3582 GoogleSearchIE(google_ie),
3585 YahooSearchIE(yahoo_ie),
# --- main program body (the enclosing 'def' line falls before this
# excerpt's sampled lines).  NOTE(review): 'try:' headers, 'else:' branches
# and several sys.exit(0)-style lines are missing between the numbered
# lines below. ---
3598 parser, opts, args = parseOpts()
3600 # Open appropriate CookieJar
# In-memory jar by default; Mozilla-format file jar when --cookies is given
# (loaded only if the file already exists and is readable).
3601 if opts.cookiefile is None:
3602 jar = cookielib.CookieJar()
3605 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3606 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3608 except (IOError, OSError), err:
3609 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print and (presumably) exit -- exit line not visible.
3612 if opts.dump_user_agent:
3613 print std_headers['User-Agent']
3616 # Batch file verification
# Merge URLs from --batch-file ('-' = stdin) with positional arguments;
# blank lines and lines starting with #, / or ; are skipped.
3618 if opts.batchfile is not None:
3620 if opts.batchfile == '-':
3623 batchfd = open(opts.batchfile, 'r')
3624 batchurls = batchfd.readlines()
3625 batchurls = [x.strip() for x in batchurls]
3626 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3628 sys.exit(u'ERROR: batch file could not be read')
3629 all_urls = batchurls + args
3631 # General configuration
# Install a global opener with proxy support, the cookie jar and the
# project's gzip/deflate-aware YoutubeDLHandler.
3632 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3633 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
3634 urllib2.install_opener(opener)
3635 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3637 extractors = gen_extractors()
# --list-extractors: show each IE and which of the given URLs it would claim.
3639 if opts.list_extractors:
3640 for ie in extractors:
3642 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
3643 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
3644 for mu in matchedUrls:
3648 # Conflicting, missing and erroneous options
3649 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3650 parser.error(u'using .netrc conflicts with giving username/password')
3651 if opts.password is not None and opts.username is None:
3652 parser.error(u'account username missing')
3653 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3654 parser.error(u'using output template conflicts with using title, literal title or auto number')
3655 if opts.usetitle and opts.useliteral:
3656 parser.error(u'using title conflicts with using literal title')
# Prompt interactively for the password when only a username was supplied.
3657 if opts.username is not None and opts.password is None:
3658 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize numeric options (rate limit, retries, playlist range).
3659 if opts.ratelimit is not None:
3660 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3661 if numeric_limit is None:
3662 parser.error(u'invalid rate limit specified')
3663 opts.ratelimit = numeric_limit
3664 if opts.retries is not None:
3666 opts.retries = long(opts.retries)
3667 except (TypeError, ValueError), err:
3668 parser.error(u'invalid retry count specified')
3670 opts.playliststart = int(opts.playliststart)
3671 if opts.playliststart <= 0:
3672 raise ValueError(u'Playlist start must be positive')
3673 except (TypeError, ValueError), err:
3674 parser.error(u'invalid playlist start number specified')
3676 opts.playlistend = int(opts.playlistend)
3677 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3678 raise ValueError(u'Playlist end must be greater than playlist start')
3679 except (TypeError, ValueError), err:
3680 parser.error(u'invalid playlist end number specified')
3681 if opts.extractaudio:
3682 if opts.audioformat not in ['best', 'aac', 'mp3']:
3683 parser.error(u'invalid audio format specified')
# Build the FileDownloader; any --get-* flag implies quiet + simulate.
3686 fd = FileDownloader({
3687 'usenetrc': opts.usenetrc,
3688 'username': opts.username,
3689 'password': opts.password,
3690 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3691 'forceurl': opts.geturl,
3692 'forcetitle': opts.gettitle,
3693 'forcethumbnail': opts.getthumbnail,
3694 'forcedescription': opts.getdescription,
3695 'forcefilename': opts.getfilename,
3696 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3697 'format': opts.format,
3698 'format_limit': opts.format_limit,
# Output template: explicit -o wins; otherwise pick a default pattern based
# on --all-formats / --title / --literal / --auto-number combinations.
3699 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3700 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3701 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3702 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3703 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3704 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3705 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3706 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3707 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3708 or u'%(id)s.%(ext)s'),
3709 'ignoreerrors': opts.ignoreerrors,
3710 'ratelimit': opts.ratelimit,
3711 'nooverwrites': opts.nooverwrites,
3712 'retries': opts.retries,
3713 'continuedl': opts.continue_dl,
3714 'noprogress': opts.noprogress,
3715 'playliststart': opts.playliststart,
3716 'playlistend': opts.playlistend,
# '-o -' streams to stdout, so logging must go to stderr instead.
3717 'logtostderr': opts.outtmpl == '-',
3718 'consoletitle': opts.consoletitle,
3719 'nopart': opts.nopart,
3720 'updatetime': opts.updatetime,
3721 'writedescription': opts.writedescription,
3722 'writeinfojson': opts.writeinfojson,
3723 'matchtitle': opts.matchtitle,
3724 'rejecttitle': opts.rejecttitle,
3726 for extractor in extractors:
3727 fd.add_info_extractor(extractor)
# Register the audio-extraction post processor when requested.
3730 if opts.extractaudio:
3731 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# Self-update mode rewrites sys.argv[0] in place.
3734 if opts.update_self:
3735 updateSelf(fd, sys.argv[0])
3738 if len(all_urls) < 1:
3739 if not opts.update_self:
3740 parser.error(u'you must provide at least one URL')
3743 retcode = fd.download(all_urls)
3745 # Dump cookie jar if requested
3746 if opts.cookiefile is not None:
3749 except (IOError, OSError), err:
3750 sys.exit(u'ERROR: unable to save cookie jar')
# Script entry point: translate the project's top-level exceptions into exit
# messages.  NOTE(review): the 'try:' header and the main() call fall in the
# missing lines of this excerpt.
3755 if __name__ == '__main__':
3758 except DownloadError:
3760 except SameFileError:
3761 sys.exit(u'ERROR: fixed output name but more than one file to download')
3762 except KeyboardInterrupt:
3763 sys.exit(u'\nERROR: Interrupted by user')
3765 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: