Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

How to fix ReadTimeout: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60) #404

Open
vriez opened this issue Jan 29, 2024 · 1 comment

Comments

@vriez
Copy link

vriez commented Jan 29, 2024

After installing the package with:

pip install tika

When attempting:

In [21]: import tika
    ...: tika.initVM()
    ...: from tika import parser

In [22]: parsed = parser.from_file(file_path)

I get

---------------------------------------------------------------------------
timeout                                   Traceback (most recent call last)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:466, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    462         except BaseException as e:
    463             # Remove the TypeError from the exception chain in
    464             # Python 3 (including for exceptions like SystemExit).
    465             # Otherwise it looks like a bug in the code.
--> 466             six.raise_from(e, None)
    467 except (SocketTimeout, BaseSSLError, SocketError) as e:

File <string>:3, in raise_from(value, from_value)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:461, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    460 try:
--> 461     httplib_response = conn.getresponse()
    462 except BaseException as e:
    463     # Remove the TypeError from the exception chain in
    464     # Python 3 (including for exceptions like SystemExit).
    465     # Otherwise it looks like a bug in the code.

File ~/anaconda3/envs/master/lib/python3.8/http/client.py:1348, in HTTPConnection.getresponse(self)
   1347 try:
-> 1348     response.begin()
   1349 except ConnectionError:

File ~/anaconda3/envs/master/lib/python3.8/http/client.py:316, in HTTPResponse.begin(self)
    315 while True:
--> 316     version, status, reason = self._read_status()
    317     if status != CONTINUE:

File ~/anaconda3/envs/master/lib/python3.8/http/client.py:277, in HTTPResponse._read_status(self)
    276 def _read_status(self):
--> 277     line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    278     if len(line) > _MAXLINE:

File ~/anaconda3/envs/master/lib/python3.8/socket.py:669, in SocketIO.readinto(self, b)
    668 try:
--> 669     return self._sock.recv_into(b)
    670 except timeout:

timeout: timed out

During handling of the above exception, another exception occurred:

ReadTimeoutError                          Traceback (most recent call last)
File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:486, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    485 try:
--> 486     resp = conn.urlopen(
    487         method=request.method,
    488         url=url,
    489         body=request.body,
    490         headers=request.headers,
    491         redirect=False,
    492         assert_same_host=False,
    493         preload_content=False,
    494         decode_content=False,
    495         retries=self.max_retries,
    496         timeout=timeout,
    497         chunked=chunked,
    498     )
    500 except (ProtocolError, OSError) as err:

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:798, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    796     e = ProtocolError("Connection aborted.", e)
--> 798 retries = retries.increment(
    799     method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
    800 )
    801 retries.sleep()

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
    549 if read is False or not self._is_method_retryable(method):
--> 550     raise six.reraise(type(error), error, _stacktrace)
    551 elif read is not None:

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/packages/six.py:770, in reraise(tp, value, tb)
    769         raise value.with_traceback(tb)
--> 770     raise value
    771 finally:

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:714, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    713 # Make the request on the httplib connection object.
--> 714 httplib_response = self._make_request(
    715     conn,
    716     method,
    717     url,
    718     timeout=timeout_obj,
    719     body=body,
    720     headers=headers,
    721     chunked=chunked,
    722 )
    724 # If we're going to release the connection in ``finally:``, then
    725 # the response doesn't need to know about the connection. Otherwise
    726 # it will also try to release it and we'll have a double-release
    727 # mess.

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:468, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    467 except (SocketTimeout, BaseSSLError, SocketError) as e:
--> 468     self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
    469     raise

File ~/anaconda3/envs/master/lib/python3.8/site-packages/urllib3/connectionpool.py:357, in HTTPConnectionPool._raise_timeout(self, err, url, timeout_value)
    356 if isinstance(err, SocketTimeout):
--> 357     raise ReadTimeoutError(
    358         self, url, "Read timed out. (read timeout=%s)" % timeout_value
    359     )
    361 # See the above comment about EAGAIN in Python 3. In Python 2 we have
    362 # to specifically catch it and throw the timeout error

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60)

During handling of the above exception, another exception occurred:

ReadTimeout                               Traceback (most recent call last)
Cell In[22], line 1
----> 1 parsed = parser.from_file(file_path)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/parser.py:40, in from_file(filename, serverEndpoint, service, xmlContent, headers, config_path, requestOptions, raw_response)
     24 '''
     25 Parses a file for metadata and content
     26 :param filename: path to file which needs to be parsed or binary file using open(path,'rb')
   (...)
     37         'content' has a str value and metadata has a dict type value.
     38 '''
     39 if not xmlContent:
---> 40     output = parse1(service, filename, serverEndpoint, headers=headers, config_path=config_path, requestOptions=requestOptions)
     41 else:
     42     output = parse1(service, filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
     43                         headers=headers, config_path=config_path, requestOptions=requestOptions)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:337, in parse1(option, urlOrPath, serverEndpoint, verbose, tikaServerJar, responseMimeType, services, rawResponse, headers, config_path, requestOptions)
    335 headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
    336 with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f:
--> 337     status, response = callServer('put', serverEndpoint, service, f,
    338                                   headers, verbose, tikaServerJar, config_path=config_path,
    339                                   rawResponse=rawResponse, requestOptions=requestOptions)
    341 if file_type == 'remote': os.unlink(path)
    342 return (status, response)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/tika/tika.py:555, in callServer(verb, serverEndpoint, service, data, headers, verbose, tikaServerJar, httpVerbs, classpath, rawResponse, config_path, requestOptions)
    552 effectiveRequestOptions = requestOptionsDefault.copy()
    553 effectiveRequestOptions.update(requestOptions)
--> 555 resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions)
    557 if verbose:
    558     print(sys.stderr, "Request headers: ", headers)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:130, in put(url, data, **kwargs)
    118 def put(url, data=None, **kwargs):
    119     r"""Sends a PUT request.
    120 
    121     :param url: URL for the new :class:`Request` object.
   (...)
    127     :rtype: requests.Response
    128     """
--> 130     return request("put", url, data=data, **kwargs)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/api.py:59, in request(method, url, **kwargs)
     55 # By using the 'with' statement we are sure the session is closed, thus we
     56 # avoid leaving sockets open which can trigger a ResourceWarning in some
     57 # cases, and look like a memory leak in others.
     58 with sessions.Session() as session:
---> 59     return session.request(method=method, url=url, **kwargs)

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    584 send_kwargs = {
    585     "timeout": timeout,
    586     "allow_redirects": allow_redirects,
    587 }
    588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
    591 return resp

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
    700 start = preferred_clock()
    702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
    705 # Total elapsed time of the request (approximately)
    706 elapsed = preferred_clock() - start

File ~/anaconda3/envs/master/lib/python3.8/site-packages/requests/adapters.py:532, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
    530     raise SSLError(e, request=request)
    531 elif isinstance(e, ReadTimeoutError):
--> 532     raise ReadTimeout(e, request=request)
    533 elif isinstance(e, _InvalidHeader):
    534     raise InvalidHeader(e, request=request)

ReadTimeout: HTTPConnectionPool(host='localhost', port=9998): Read timed out. (read timeout=60)

In [23]: 

How can I overcome it?

@vriez
Copy link
Author

vriez commented Jan 29, 2024

Never mind, I had missed setting the following environment variable:

TIKA_SERVER_JAR="file:////tika-server-standard.jar"

After setting this environment variable, it worked.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant