You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Hey folks, I've been trying to search by URL (to check for duplicate articles), and it never seems to return anything. As you can see below, I have tried a lot of variations and just can't get URL-based search to work!
My code is below. Searching by title absolutely works, but URLs just never seem to return anything. What's going on?
import asyncio
import os
from typing import Any, Dict, List
from urllib.parse import quote, quote_plus, urlparse, urlunparse

import httpx
from dotenv import load_dotenv

# Load settings from a .env file (if any) before the API key is read below.
load_dotenv()

# GraphQL endpoint every request in this script is posted to.
API_URL = "https://api-prod.omnivore.app/api/graphql"
# API key taken from the environment; None when OMNIVORE_API_KEY is not set.
API_KEY = os.getenv("OMNIVORE_API_KEY")
async def query_omnivore(query: str, variables: Dict[str, Any] = None) -> Dict[str, Any]:
    """POST a GraphQL query to ``API_URL`` and return the decoded JSON body.

    Args:
        query: GraphQL document to execute.
        variables: Values for the document's variables; ``None`` is sent as
            an empty mapping.

    Returns:
        The parsed JSON response as a dict.

    Raises:
        RuntimeError: If ``API_KEY`` is empty or unset.
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    # Fail fast with an actionable message: without this guard a missing
    # OMNIVORE_API_KEY puts None into the Authorization header, which is not
    # a valid header value and fails inside httpx with an unrelated-looking
    # error.
    if not API_KEY:
        raise RuntimeError(
            "OMNIVORE_API_KEY is not set; export it or add it to your .env file"
        )

    headers = {
        "Content-Type": "application/json",
        "Authorization": API_KEY,
    }
    payload = {
        "query": query,
        "variables": variables or {},
    }
    async with httpx.AsyncClient() as client:
        response = await client.post(API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
async def search_with_strategy(strategy: str, search_term: str) -> List[Dict[str, Any]]:
    """Run a search whose query string is ``<strategy>:<search_term>``.

    Args:
        strategy: Field prefix placed before the colon (callers in this file
            pass ``"url"`` and ``"title"``).
        search_term: Text placed after the colon, including any quoting or
            wildcard characters the caller wants to send.

    Returns:
        The ``node`` dict of every returned edge (``id``, ``url``, ``title``
        and ``createdAt`` are requested), or an empty list when the response
        carries no edges. Reported errors are printed rather than raised so
        the caller's best-effort loop keeps going.
    """
    # Same query text as before, split across lines for readability only.
    query = (
        " query Search($after: String, $first: Int, $query: String) {"
        " search(after: $after, first: $first, query: $query) {"
        " ... on SearchSuccess { edges { node { id url title createdAt } } }"
        " ... on SearchError { errorCodes }"
        " } } "
    )
    variables = {
        "query": f"{strategy}:{search_term}",
        "first": 10,
    }
    result = await query_omnivore(query, variables)

    # Surface failures instead of letting them look like "no matches".
    if result.get("errors"):
        print(f"GraphQL errors for {variables['query']!r}: {result['errors']}")

    # A GraphQL response may carry "data": null alongside "errors"; the
    # previous `"search" in result["data"]` check raised TypeError then.
    search = (result.get("data") or {}).get("search") or {}
    if search.get("errorCodes"):
        print(f"Search error for {variables['query']!r}: {search['errorCodes']}")

    return [edge["node"] for edge in search.get("edges") or []]
async def get_article_by_id(article_id: str) -> Dict[str, Any]:
    """Fetch one article's details through the ``article`` GraphQL query.

    Args:
        article_id: Value sent as the query's ``$id`` variable.

    Returns:
        The inner ``article`` dict (id, title, url, originalArticleUrl,
        createdAt, savedAt, publishedAt and content are requested), or an
        empty dict when the response does not contain one. Reported errors
        are printed rather than raised.
    """
    # Same query text as before, split across lines for readability only.
    query = (
        " query Article($id: ID!) {"
        " article(id: $id) {"
        " ... on ArticleSuccess { article {"
        " id title url originalArticleUrl createdAt savedAt publishedAt content"
        " } }"
        " ... on ArticleError { errorCodes }"
        " } } "
    )
    variables = {"id": article_id}
    result = await query_omnivore(query, variables)

    # Surface failures instead of silently returning an empty dict.
    if result.get("errors"):
        print(f"GraphQL errors for article {article_id!r}: {result['errors']}")

    # A GraphQL response may carry "data": null alongside "errors"; the
    # previous `"article" in result["data"]` check raised TypeError then.
    payload = (result.get("data") or {}).get("article") or {}
    if payload.get("errorCodes"):
        print(f"Article error for {article_id!r}: {payload['errorCodes']}")

    return payload.get("article") or {}
def generate_partial_url_matches(url: str) -> List[str]:
    """Build candidate search strings derived from *url*.

    The candidates are the URL itself (with and without trailing slashes),
    host + path, the path with and without its leading slash, every trailing
    run of path segments (bare and prefixed with the host), the first three
    dash-separated pieces of the last segment, and the second-to-last host
    label.

    Args:
        url: The URL to derive variations from.

    Returns:
        Unique, non-empty candidates in a stable order (first occurrence
        wins).
    """
    parsed = urlparse(url)
    # Drop the empty segments that leading, trailing or doubled slashes
    # produce; keeping them yielded "" and "host//path" candidates.
    segments = [part for part in parsed.path.split('/') if part]
    last_segment = segments[-1] if segments else ''
    host_labels = parsed.netloc.split('.')

    candidates = [
        url,
        url.rstrip('/'),
        parsed.netloc + parsed.path,
        parsed.path,
        parsed.path.lstrip('/'),
        '/'.join(segments[-2:]),
        last_segment,
        # Joining the first three dash-separated pieces is a no-op when the
        # segment has no dash, so no separate branch is needed.
        '-'.join(last_segment.split('-')[:3]),
        host_labels[-2] if len(host_labels) > 1 else parsed.netloc,
    ]

    # Trailing runs of path segments, shortest first, then the same runs
    # prefixed with the host.
    tails = ['/'.join(segments[-count:]) for count in range(1, len(segments) + 1)]
    candidates.extend(tails)
    candidates.extend(f"{parsed.netloc}/{tail}" for tail in tails)

    # dict.fromkeys de-duplicates while keeping insertion order, so repeated
    # runs try the variations in the same sequence.
    return list(dict.fromkeys(candidate for candidate in candidates if candidate))


def _format_hit(hit):
    """Return the one-line summary printed for a search result node."""
    return f" ID: {hit['id']}, Title: {hit['title']}, URL: {hit['url']}"


async def debug_partial_url_search(url: str, title: str):
    """Try many ``url:`` searches for *url*, then fall back to a title search.

    For every variation from ``generate_partial_url_matches`` four patterns
    are searched (quoted, ``*x*``, ``x*``, ``*x``) and the outcome of each is
    printed. After all variations have been tried, a quoted ``title:`` search
    is run and the stored details of every hit are printed.

    Args:
        url: URL whose variations are searched for.
        title: Article title used for the fallback search.
    """
    print(f"Debugging partial URL search for: {url}")
    print(f"Article title: {title}")

    for partial in generate_partial_url_matches(url):
        print(f"\nTrying partial match: {partial}")
        # (label used in the printed messages, search term sent to the API)
        attempts = (
            ("exact", f'"{partial}"'),
            ("contains", f'*{partial}*'),
            ("starts with", f'{partial}*'),
            ("ends with", f'*{partial}'),
        )
        for label, pattern in attempts:
            results = await search_with_strategy("url", pattern)
            if results:
                print(f"Found results for {label} partial match: {partial}")
                for hit in results:
                    print(_format_hit(hit))
            else:
                print(f"No results found for {label} partial match")

    # Try title search as a fallback
    print("\nTrying title search as fallback")
    results = await search_with_strategy("title", f'"{title}"')
    if not results:
        print("No articles found by title search")
        return

    print("Found articles by title search:")
    for hit in results:
        print(_format_hit(hit))
        # Fetch full article details
        article_details = await get_article_by_id(hit['id'])
        if article_details:
            print(" Article details:")
            print(f" Stored URL: {article_details.get('url')}")
            print(f" Original URL: {article_details.get('originalArticleUrl')}")
            print(f" Created At: {article_details.get('createdAt')}")
            print(f" Saved At: {article_details.get('savedAt')}")
            print(f" Published At: {article_details.get('publishedAt')}")
            # The key can be present with a null value, in which case
            # .get('content', '') returns None and slicing it would raise.
            content = (article_details.get('content') or '')[:100]
            print(f" Content preview: {content}...")
async def main():
    """Run the URL-search debugging session for one hard-coded article."""
    await debug_partial_url_search(
        "https://www.nature.com/articles/d41586-024-02012-5",
        "Not all",
    )


# Only start the event loop when executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
The text was updated successfully, but these errors were encountered:
Hey folks, I've been trying to search by URL (to check for duplicate articles), and it never seems to return anything. As you can see below, I have tried a lot of variations and just can't get URL-based search to work!
My code is below. Searching by title absolutely works, but URLs just never seem to return anything. What's going on?
The text was updated successfully, but these errors were encountered: