import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
// Type of the property graph used in this walkthrough: each vertex carries a
// (name, position) pair of Strings and each edge carries a String label.
// NOTE(review): this is a declaration without an initializer, so the REPL rejects it
// ("value userGraph is not defined" in the output below); it is shown only to
// illustrate the type signature. The actual graph is constructed later as `graph`.
val userGraph: Graph[(String, String), String]
Name: Compile Error
Message: <console>:30: error: class $iw needs to be abstract, since value userGraph is not defined
class $iw extends Serializable {
^
StackTrace:
// Vertex RDD: every element is (vertex id, (user name, position)).
val users: RDD[(VertexId, (String, String))] = sc.parallelize(
  Seq(
    (3L, ("rxin", "student")),
    (7L, ("jgonzal", "postdoc")),
    (5L, ("franklin", "prof")),
    (2L, ("istoica", "prof"))
  )
)
users = ParallelCollectionRDD[0] at parallelize at <console>:35
ParallelCollectionRDD[0] at parallelize at <console>:35
// Edge RDD: every Edge is (source vertex id, destination vertex id, relationship label).
val relationships: RDD[Edge[String]] = sc.parallelize(
  Seq(
    Edge(3L, 7L, "collab"),
    Edge(5L, 3L, "advisor"),
    Edge(2L, 5L, "colleague"),
    Edge(5L, 7L, "pi")
  )
)
relationships = ParallelCollectionRDD[1] at parallelize at <console>:34
ParallelCollectionRDD[1] at parallelize at <console>:34
// Default vertex attribute handed to the Graph constructor.
val defaultUser: (String, String) = ("John Doe", "Missing")
// Build the property graph from the vertex RDD and the edge RDD defined above.
val graph = Graph(vertices = users, edges = relationships, defaultVertexAttr = defaultUser)
defaultUser = (John Doe,Missing)
graph = org.apache.spark.graphx.impl.GraphImpl@7c40e7e8
org.apache.spark.graphx.impl.GraphImpl@7c40e7e8
// Count the vertices whose position attribute is "postdoc".
graph.vertices.filter { case (_, (_, position)) => position == "postdoc" }.count()
1
// Count the vertices whose position attribute is "prof".
graph.vertices.filter { case (_, (_, position)) => position == "prof" }.count()
2
// Count the edges whose source id is smaller than their destination id.
graph.edges.filter(edge => edge.srcId < edge.dstId).count()
3
Graph 操作
详见 https://spark.apache.org/docs/latest/graphx-programming-guide.html 的Graph类
1. 图信息
// Number of edges in the graph
graph.numEdges
4
// Number of vertices in the graph
graph.numVertices
4
// In-degree of each vertex, as (vertex id, in-degree) pairs.
// inDegrees already holds a single entry per vertex, so no reduceByKey is needed;
// take(5) returns at most five of those entries.
graph.inDegrees.take(5)
Array((3,1), (5,1), (7,2))
// Out-degree of each vertex, as (vertex id, out-degree) pairs.
// outDegrees already holds a single entry per vertex, so no reduceByKey is needed;
// take(5) returns at most five of those entries.
graph.outDegrees.take(5)
Array((2,1), (3,1), (5,2))
// Total degree (in + out) of each vertex, as (vertex id, degree) pairs.
// degrees already holds a single entry per vertex, so no reduceByKey is needed
// before collecting the result to the driver.
graph.degrees.collect()
Array((2,1), (3,2), (5,3), (7,2))
2.图视图
// Vertex view: count the vertices whose position attribute is "postdoc".
graph.vertices.filter { case (_, (_, position)) => position == "postdoc" }.count()
1
// Edge view: count the edges whose source id is smaller than their destination id.
graph.edges.filter(edge => edge.srcId < edge.dstId).count()
3
// Triplet view: collect every edge together with the attributes of its
// source and destination vertices.
graph.triplets.collect()
Array(((3,(rxin,student)),(7,(jgonzal,postdoc)),collab), ((5,(franklin,prof)),(3,(rxin,student)),advisor), ((2,(istoica,prof)),(5,(franklin,prof)),colleague), ((5,(franklin,prof)),(7,(jgonzal,postdoc)),pi))
3.图缓存
- persist
- cache
- unpersistVertices
4.分区
- partitionBy
5.顶点与边的转换
- mapVertices
- mapEdges
- mapTriplets
6.修改图结构
- reverse
- subgraph
- mask
- groupEdges
7.用图连接RDD
- joinVertices
- outerJoinVertices
8.汇集邻近的三元组信息
- collectNeighborIds
- collectNeighbors
- aggregateMessages
9.迭代并行图计算
- pregel
10.基本图算法
- pageRank
- connectedComponents
- triangleCount
- stronglyConnectedComponents