1 回答
TA贡献1828条经验 获得超6个赞
我最近一直在玩 igraph 和 API,所以这是相当新鲜的。我认为下面的代码可以满足您的需求,但它确实省略了一些复杂性(例如不会使 API 超时)。它不是非常快 - 我怀疑其中很多与使用 as_data_frame 接口来跟踪顶点有关。
所以我确信它可以被优化并且我确信在某个时候 API 会以一种破坏它的编码返回一些东西,但这是一个开始。
library(igraph)
api_fetch <- function(query){
result <- jsonlite::fromJSON(paste0('http://suggestqueries.google.com/complete/search?client=firefox&q=', httpuv::encodeURIComponent(query)))
return(result)
}
build_query_graph <- function(entry_word, max_depth=2){
# Create an empty graph
graph <- make_empty_graph()
entry_word <- tolower(trimws(entry_word))
graph <- add_vertices(graph, 1, name=entry_word, searched=FALSE)
# Keep on doing this until the graph hits the maximum depth from the entry word
while(TRUE){
# Look up the current vertices and find their depths from the entry word
vertices <- as_data_frame(graph, what='vertices')
vertex_depth <- distances(graph, v=entry_word)
vertices$depth <- vertex_depth[match(colnames(vertex_depth), vertices$name)]
# Find vertices at least one step from the maximum depth and that haven't
# already been searched and sort to get the shallowest at the top
live_vertices <- subset(vertices, depth <= (max_depth - 1) & ! searched)
live_vertices <- live_vertices[order(live_vertices$depth),]
# If there are any vertices meeting these criteria, then query the API
# otherwise bail from the while loop
if(nrow(live_vertices)){
# Get the vertex name and query it
this_vertex <- live_vertices$name[1]
res <- api_fetch(this_vertex)
# For each of the daughter results, check it isn't already a vertex
# and add an edge from it to this_vertex
for(daughter in res[[2]]){
if(! daughter %in% get.vertex.attribute(graph, 'name')){
graph <- add_vertices(graph, 1, name=daughter, searched=FALSE)
}
graph <- add_edges(graph, c(this_vertex, daughter))
}
# Don't search this vertex again
graph <- set_vertex_attr(graph, 'searched', this_vertex, TRUE)
} else {
break
}
}
return(graph)
}
运行:
> g <- build_query_graph('amazon')
> g
IGRAPH 0ec19b6 DN-- 90 100 --
+ attr: name (v/c), searched (v/l)
+ edges from 0ec19b6 (vertex names):
[1] amazon ->amazon amazon ->amazon prime amazon ->amazon prime video
[4] amazon ->amazon uk amazon ->amazon music amazon ->amazon smile
[7] amazon ->amazon india amazon ->amazon jobs amazon ->amazon video
[10] amazon ->amazon customer service amazon prime ->amazon prime amazon prime ->amazon prime video
[13] amazon prime ->amazon prime movies amazon prime ->amazon prime music amazon prime ->amazon prime now
[16] amazon prime ->amazon prime login amazon prime ->amazon prime uk amazon prime ->amazon prime tv
[19] amazon prime ->amazon prime cost amazon prime ->amazon prime student amazon prime video->amazon prime video
[22] amazon prime video->amazon prime video login amazon prime video->amazon prime video app amazon prime video->amazon prime video uk
+ ... omitted several edges
> plot(g)
考虑一下,重复重新计算所有距离并进行大量排序和匹配。在创建单个顶点时保存它们的深度可能会更快:
build_query_graph <- function(entry_word, max_depth=2){
# Create an empty graph
graph <- make_empty_graph()
entry_word <- tolower(trimws(entry_word))
graph <- add_vertices(graph, 1, name=entry_word, depth=0, searched=FALSE)
# Keep on doing this until the graph hits the maximum depth from the entry word
while(TRUE){
# Look up the current vertices and find their depths from the entry word
vertices <- as_data_frame(graph, what='vertices')
# Find vertices at least one step from the maximum depth and that haven't
# already been searched and sort to get the shallowest at the top
live_vertices <- subset(vertices, depth <= (max_depth - 1) & ! searched)
live_vertices <- live_vertices[order(live_vertices$depth),]
# If there are any vertices meeting these criteria, then query the API
# otherwise bail from the while loop
if(nrow(live_vertices)){
# Get the vertex name and query it
this_vertex <- live_vertices$name[1]
res <- api_fetch(this_vertex)
# For each of the daughter results, check it isn't already a vertex
# add an edge from it to this_vertex and store the depth from the entry word
for(daughter in res[[2]]){
if(! daughter %in% get.vertex.attribute(graph, 'name')){
graph <- add_vertices(graph, 1, name=daughter, depth=NA, searched=FALSE)
}
graph <- add_edges(graph, c(this_vertex, daughter))
graph <- set_vertex_attr(graph, 'depth', daughter,
distances(graph, v=entry_word, to=daughter))
}
# Don't search this vertex again
graph <- set_vertex_attr(graph, 'searched', this_vertex, TRUE)
} else {
break
}
}
return(graph)
}
添加回答
举报