New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Unique names for dumped files with --write-pages
#9773
Comments
This comment was marked as duplicate.
This comment was marked as duplicate.
Adding a timestamp will break `--load-pages`, since loading must recompute exactly the same filename. |
Encoding all the posted data would make the filename really long, but a hash of the POST data should be unique enough without making the filename too long.
In my environment, the following patch works correctly. # This patch is in the public domain.
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1005,8 +1005,9 @@ def __check_blocked(self, content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
- def _request_dump_filename(self, url, video_id):
- basen = f'{video_id}_{url}'
+ def _request_dump_filename(self, url, data, video_id):
+ hash = hashlib.sha256(str(data).encode('utf-8')).hexdigest()
+ basen = f'{video_id}_{hash}_{url}'
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1037,7 +1038,7 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.url, video_id)
+ filename = self._request_dump_filename(urlh.url, urlh._requests_response.request.__dict__['body'], video_id)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1098,7 +1099,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, url_or_request.data, video_id)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
Okay, how about this? # This patch is in the public domain (CC0).
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -957,7 +957,7 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
if urlh is False:
assert not fatal
return False
- content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data)
return (content, urlh)
@staticmethod
@@ -1005,8 +1005,9 @@ def __check_blocked(self, content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
- def _request_dump_filename(self, url, video_id):
- basen = f'{video_id}_{url}'
+ def _request_dump_filename(self, url, data, video_id):
+ hash = hashlib.sha256(str(data).encode('utf-8')).hexdigest()
+ basen = f'{video_id}_{hash}_{url}'
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1028,7 +1029,7 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
except LookupError:
return webpage_bytes.decode('utf-8', 'replace')
- def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
@@ -1037,7 +1038,7 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.url, video_id)
+ filename = self._request_dump_filename(urlh.url, data, video_id)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1098,7 +1099,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, url_or_request.data, video_id)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
@minamotorin I think that basically works. I would suggest a few changes though: diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index bebbc6b43..be9f07c38 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
if urlh is False:
assert not fatal
return False
- content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
+ encoding=encoding, data=data)
return (content, urlh)
@staticmethod
@@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
- def _request_dump_filename(self, url, video_id):
- basen = f'{video_id}_{url}'
+ def _request_dump_filename(self, url, video_id, data=None):
+ if data is not None:
+ data = hashlib.md5(data).hexdigest()
+ basen = join_nonempty(video_id, data, url, delim='_')
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1028,16 +1031,19 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
except LookupError:
return webpage_bytes.decode('utf-8', 'replace')
- def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
+ prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
+ if isinstance(url_or_request, Request):
+ data = data if data is not None else url_or_request.data
if self.get_param('dump_intermediate_pages', False):
self.to_screen('Dumping request to ' + urlh.url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.url, video_id)
+ filename = self._request_dump_filename(urlh.url, video_id, data)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1098,7 +1104,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE
Checklist
Provide a description that is worded well enough to be understood
Thanks for the great program.
While debugging #9358, I found that the `--write-pages` option dumps different responses to the same file. The cause is that the dump file name is built from only `video_id` and `url` in `_request_dump_filename` in `extractor/common.py`. The YouTube comment API usually uses the same URL, so these responses all end up in the same file:
`None_https_-_www.youtube.com_youtubei_v1_nextkey=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8_prettyPrint=false.dump`
If the file already exists, it will be overwritten regardless of the `--no-overwrites` option. This is inconvenient for debugging.
I suggest giving a unique name to every dump file.
Personally, I applied the following patch when debugging.
Thanks.
Provide verbose output that clearly demonstrates the problem
Add `-vU` to your command line (`yt-dlp -vU <your command line>`). If you are calling yt-dlp through its API, add `'verbose': True` to the `YoutubeDL` params instead. Copy the WHOLE output (starting with `[debug] Command-line config`) and insert it below.
No response
The text was updated successfully, but these errors were encountered: